You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
183 lines
4.5 KiB
Python
183 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Download GeoNames cities5000 from https://download.geonames.org/export/dump/ and seed a SQLite database."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import io
|
|
import sqlite3
|
|
import sys
|
|
import urllib.request
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
# Base name shared by the remote zip file and the text member stored inside it.
_DATASET = "cities5000"

# Remote location of the zipped GeoNames dump that gets downloaded.
SOURCE_URL = f"https://download.geonames.org/export/dump/{_DATASET}.zip"
# Name of the archive member that is opened and parsed line by line.
ARCHIVE_ENTRY = f"{_DATASET}.txt"
|
|
|
|
# DDL for the ``cities`` table. The 19 columns are declared in the same order
# in which values are bound by the insert statement. ``IF NOT EXISTS`` makes
# the statement safe to run against a database that was already seeded.
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS cities (
    geoname_id INTEGER PRIMARY KEY,
    name TEXT NOT NULL,
    ascii_name TEXT NOT NULL,
    alternate_names TEXT,
    latitude REAL NOT NULL,
    longitude REAL NOT NULL,
    feature_class TEXT,
    feature_code TEXT,
    country_code TEXT,
    cc2 TEXT,
    admin1_code TEXT,
    admin2_code TEXT,
    admin3_code TEXT,
    admin4_code TEXT,
    population INTEGER,
    elevation INTEGER,
    dem INTEGER,
    timezone TEXT,
    modification_date TEXT
)
"""
|
|
|
|
# (index-name suffix, indexed column expression) for each secondary index on
# the ``cities`` table.
_CITY_INDEXES = (
    ("name", "name"),
    ("ascii_name", "ascii_name"),
    ("alternate_names", "alternate_names"),
    ("country_code", "country_code"),
    ("population", "population DESC"),
)

# One ``CREATE INDEX IF NOT EXISTS`` statement per entry above, so running
# them against an existing database is a no-op.
CREATE_INDEXES_SQL = tuple(
    f"CREATE INDEX IF NOT EXISTS idx_cities_{suffix} ON cities({expression})"
    for suffix, expression in _CITY_INDEXES
)
|
|
|
|
# Parameterized upsert for one city row: 19 named columns matched by 19
# positional ``?`` placeholders. ``INSERT OR REPLACE`` overwrites an existing
# row that conflicts on a uniqueness constraint instead of failing, which
# lets the seeding be re-run over an already populated table.
INSERT_SQL = """
INSERT OR REPLACE INTO cities (
    geoname_id,
    name,
    ascii_name,
    alternate_names,
    latitude,
    longitude,
    feature_class,
    feature_code,
    country_code,
    cc2,
    admin1_code,
    admin2_code,
    admin3_code,
    admin4_code,
    population,
    elevation,
    dem,
    timezone,
    modification_date
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
|
|
|
|
|
|
def _positive_int(text: str) -> int:
    """Parse *text* as an integer and reject values below 1 (argparse ``type=`` hook)."""
    try:
        number = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the seeding script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        A namespace with ``db`` (required target database path, as given) and
        ``batch_size`` (positive int, default 1000).

    Raises:
        SystemExit: Raised by argparse when ``--db`` is missing or
            ``--batch-size`` is not a positive integer.
    """
    parser = argparse.ArgumentParser(description="Seed GeoNames cities5000 into SQLite")
    parser.add_argument("--db", required=True, help="Target SQLite database path")
    parser.add_argument(
        # A zero or negative batch size is meaningless, so reject it up front.
        "--batch-size", type=_positive_int, default=1000, help="Insert batch size"
    )
    return parser.parse_args(argv)
|
|
|
|
|
|
def to_int(value: str) -> int | None:
    """Convert a text field to ``int``.

    Surrounding whitespace is ignored. An empty or whitespace-only field
    yields ``None``; any other non-integer text raises ``ValueError``.
    """
    stripped = value.strip()
    return int(stripped) if stripped else None
|
|
|
|
|
|
def to_float(value: str) -> float:
    """Convert a text field to ``float`` after trimming surrounding whitespace.

    Raises ``ValueError`` when the trimmed text is not a valid float
    (including when it is empty).
    """
    trimmed = value.strip()
    return float(trimmed)
|
|
|
|
|
|
def row_from_columns(columns: list[str]) -> tuple:
    """Build a 19-value row tuple from the raw text fields of one record.

    Field 0 is converted with ``int``, fields 4-5 with ``to_float`` and
    fields 14-16 with ``to_int`` (blank becomes ``None``); every other field
    is passed through unchanged as text. Values keep their input order.

    Raises:
        IndexError: If fewer than 19 fields are supplied.
        ValueError: If a numeric field cannot be converted.
    """
    return (
        int(columns[0]),
        *columns[1:4],
        to_float(columns[4]),
        to_float(columns[5]),
        *columns[6:14],
        to_int(columns[14]),
        to_int(columns[15]),
        to_int(columns[16]),
        # Indexed (not sliced) so a short input still raises IndexError.
        columns[17],
        columns[18],
    )
|
|
|
|
|
|
def download_archive(timeout: float = 60.0) -> bytes:
    """Download the zip archive at ``SOURCE_URL`` and return its raw bytes.

    Args:
        timeout: Seconds to wait on a blocking network operation before
            giving up. Without a timeout a stalled connection would hang
            the script indefinitely.

    Returns:
        The complete response body, read into memory.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for an
            HTTP error status).
        OSError: A timeout while reading the body may surface as a socket
            timeout rather than ``URLError``.
    """
    with urllib.request.urlopen(SOURCE_URL, timeout=timeout) as response:
        return response.read()
|
|
|
|
|
|
def stream_city_rows(archive_bytes: bytes):
    """Yield one parsed row tuple per well-formed record in the zipped dump.

    The archive is opened from memory, the member named ``ARCHIVE_ENTRY`` is
    read line by line as UTF-8, and each tab-separated line with exactly 19
    fields is converted by ``row_from_columns``.

    Args:
        archive_bytes: Raw bytes of the zip archive.

    Yields:
        The tuple produced by ``row_from_columns`` for each accepted line.

    Raises:
        zipfile.BadZipFile: If ``archive_bytes`` is not a valid zip archive.
        KeyError: If the archive has no member named ``ARCHIVE_ENTRY``.
        UnicodeDecodeError: If a line is not valid UTF-8.
        ValueError: If a numeric field of an accepted line cannot be parsed.
    """
    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zf:
        with zf.open(ARCHIVE_ENTRY) as raw_file:
            for raw_line in raw_file:
                # Strip CRLF as well as LF endings so a stray "\r" never
                # ends up glued to the last field.
                line = raw_line.decode("utf-8").rstrip("\r\n")
                if not line:
                    continue

                columns = line.split("\t")
                if len(columns) != 19:
                    # Best-effort import: records with an unexpected number
                    # of fields are skipped rather than aborting the run.
                    continue

                yield row_from_columns(columns)
|
|
|
|
|
|
def seed_database(db_path: Path, batch_size: int) -> int:
    """Download the dump and load its rows into the SQLite database.

    The archive is downloaded first, then the parent directory of *db_path*
    is created if needed, the table and indexes are created if missing, and
    rows are written with ``executemany`` in batches.

    Args:
        db_path: Path of the SQLite database file to create or update.
        batch_size: Number of rows to accumulate before each bulk insert.

    Returns:
        The number of rows passed to the insert statement.

    Raises:
        Exception: Download, archive, parsing and ``sqlite3`` errors are not
            handled here and propagate to the caller.
    """
    archive = download_archive()

    db_path.parent.mkdir(parents=True, exist_ok=True)

    inserted = 0
    batch: list[tuple] = []

    conn = sqlite3.connect(str(db_path))
    try:
        # The connection context manager commits on success and rolls back
        # on an exception, but it does NOT close the connection; that is
        # done explicitly in the ``finally`` clause below.
        with conn:
            cursor = conn.cursor()
            cursor.execute("PRAGMA journal_mode=WAL")
            cursor.execute(CREATE_TABLE_SQL)
            for sql in CREATE_INDEXES_SQL:
                cursor.execute(sql)

            for row in stream_city_rows(archive):
                batch.append(row)
                if len(batch) >= batch_size:
                    cursor.executemany(INSERT_SQL, batch)
                    inserted += len(batch)
                    batch.clear()

            # Flush the final, partially filled batch.
            if batch:
                cursor.executemany(INSERT_SQL, batch)
                inserted += len(batch)
    finally:
        conn.close()

    return inserted
|
|
|
|
|
|
def main() -> int:
    """Run the seeding workflow and report the outcome.

    Returns:
        ``0`` after a successful run (a summary line is printed to stdout),
        ``1`` if seeding raised any exception (the message goes to stderr).
    """
    options = parse_args()
    try:
        row_count = seed_database(Path(options.db), options.batch_size)
    except Exception as error:  # noqa: BLE001
        # Top-level boundary: turn any failure into a message and exit code.
        print(f"failed to seed cities database: {error}", file=sys.stderr)
        return 1
    else:
        print(f"seeded {row_count} city rows into {options.db}")
        return 0
|
|
|
|
|
|
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    sys.exit(main())
|