spacedb/data/process_sbdb.py
2025-12-08 15:12:52 -05:00

128 lines
3.9 KiB
Python
Executable File

#!/usr/bin/env python
import csv
import gzip
import json
import logging
import os
import sys
import django
from django.core.exceptions import ObjectDoesNotExist
from django.db import transaction
from django.utils.text import slugify
# Put the parent of this script's directory at the front of sys.path so the
# `spacedb`, `spaceobjects` and `data` imports below can be resolved.
current_dir = os.path.dirname(__file__)
parent_dir = os.path.join(current_dir, '../')
sys.path.insert(0, os.path.realpath(parent_dir))
# setdefault: a DJANGO_SETTINGS_MODULE already present in the environment wins.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'spacedb.settings_pipeline')
# Configure Django before the model imports that follow; keep this ordering.
django.setup()
from spaceobjects.models import SpaceObject, OrbitClass, ObjectType
from data.util import get_normalized_full_name, queryset_iterator
# Emit INFO-level progress messages from this script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@transaction.atomic
def insert_all(newobjects, delete=False):
    """Bulk-insert ``newobjects``, optionally clearing the table first.

    The whole call runs inside a single database transaction.

    Args:
        newobjects: unsaved SpaceObject instances to insert.
        delete: when true, every existing SpaceObject row is deleted (one
            ``delete()`` call per row) before the new objects are inserted.
    """
    if delete:
        logger.info('Deleting...')
        # Only the primary key is loaded, and rows are streamed rather than
        # cached on the queryset.
        existing_rows = SpaceObject.objects.all().only('pk')
        for existing in existing_rows.iterator():
            existing.delete()
        logger.info('Finished deleting.')

    SpaceObject.objects.bulk_create(newobjects, batch_size=499)
def process(reader):
    """Convert SBDB rows into SpaceObject records and insert them in batches.

    Rows with an empty ``ma`` (mean anomaly) value are logged, counted and
    skipped. Pending objects are flushed through ``insert_all`` every 1000
    input rows and once more after the last row. The first flush is called
    with ``delete=True``; all later ones with ``delete=False``.

    Args:
        reader: iterable of dict-like rows. Each row is read by the keys
            ``full_name``, ``name``, ``class``, ``a``, ``e``, ``i``, ``om``,
            ``w``, ``ma``, ``epoch``, ``neo``, ``pha``, ``H``, ``M1``,
            ``diameter``, ``spec_B`` and ``spec_T``.

    Raises:
        KeyError: if a row lacks one of the required keys (``diameter`` is
            the exception: a missing or unparsable diameter becomes None).
        ValueError: if an orbital element or magnitude is non-empty but not
            parseable as a float.
    """
    flush_every = 1000
    newobjects = []
    inserted_once = False
    failures_ma = 0
    # Orbit classes repeat across many rows; look each abbreviation up once.
    orbit_classes = {}

    for count, row in enumerate(reader, 1):
        if count % flush_every == 0:
            logger.info(count)
            # Subdivide insertions - slower, but needed for low memory
            # environments like production machine
            logger.info('Inserting...')
            insert_all(newobjects, delete=(not inserted_once))
            inserted_once = True
            newobjects = []

        if not row['ma']:
            logger.warning(
                'Missing mean anom: Failed to parse row %d (%s): %s',
                count, row.get('full_name', '?'), json.dumps(row))
            failures_ma += 1
            continue

        fullname = get_normalized_full_name(row['full_name'])

        class_abbrev = row['class']
        if class_abbrev not in orbit_classes:
            try:
                orbit_classes[class_abbrev] = OrbitClass.objects.get(
                    abbrev__iexact=class_abbrev)
            except ObjectDoesNotExist:
                orbit_classes[class_abbrev] = None
        orbit_class = orbit_classes[class_abbrev]

        object_type = ObjectType.from_class(class_abbrev)

        # Comets always use the normalized full name as their display name.
        if row['name'] and object_type != ObjectType.COMET:
            shortname = row['name'].strip()
        else:
            shortname = fullname

        magnitude = float(row['H']) if row['H'] else None
        # Compare against None explicitly: an H of 0.0 is a valid value and
        # must not be replaced by the fallback.
        if magnitude is None:
            # Comet total magnitude
            magnitude = float(row['M1']) if row['M1'] else None

        # Diameter is best-effort: absent, null or unparsable values
        # become None instead of aborting the import.
        try:
            diam = float(row['diameter'].strip())
        except (AttributeError, KeyError, TypeError, ValueError):
            diam = None

        newobjects.append(SpaceObject(
            fullname=fullname,
            name=shortname,
            slug=slugify(fullname.replace('/', ' ')),
            a=float(row['a']),
            e=float(row['e']),
            i=float(row['i']),
            om=float(row['om']),
            w=float(row['w']),
            ma=float(row['ma']),
            epoch=float(row['epoch']),
            is_nea=(row['neo'] == 'Y'),
            is_pha=(row['pha'] == 'Y'),
            orbit_class=orbit_class,
            object_type=object_type,
            diameter=diam,
            spec_B=row['spec_B'],
            spec_T=row['spec_T'],
            H=magnitude,
            sbdb_entry=row,
        ))

    logger.info('Inserting final records...')
    insert_all(newobjects, delete=(not inserted_once))

    logger.warning('%d blank mean anomalies', failures_ma)
    logger.info('Done.')
def generate_rows(fields, data):
    """Lazily yield one dict per record, pairing ``fields`` with its values.

    Pairing follows ``zip``: a record shorter than ``fields`` yields a dict
    with only the leading keys, and surplus values are dropped.

    Args:
        fields: sequence of column names, reused for every record.
        data: iterable of value sequences, one per record.

    Yields:
        dict mapping each field name to the value at the same position.
    """
    for values in data:
        yield {column: value for column, value in zip(fields, values)}
if __name__ == '__main__':
    # Script entry point: read the gzipped SBDB dump stored next to this
    # file and feed its records to process().
    logger.info('Loading sbdb data...')
    script_dir = os.path.dirname(os.path.realpath(__file__))
    archive_path = os.path.realpath(
        os.path.join(script_dir, 'rawdata/sbdb.json.gz'))
    with gzip.open(archive_path) as archive:
        payload = json.load(archive)

    logger.info('Loaded sbdb data, processing...')
    # The dump is read through its 'fields' and 'data' keys; generate_rows
    # pairs them into one dict per record.
    process(generate_rows(payload['fields'], payload['data']))