import os
from datetime import datetime, timedelta
from pytz import timezone
from warnings import warn, filterwarnings
from tqdm import tqdm
import numpy as np
import pandas as pd
import pickle
from sklearn.neighbors import BallTree
from geopandas import GeoDataFrame
from scipy.special import comb
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from functools import partial
from . import MPLSTYLE, ROOT_PATH
from .utils import reconcile_input
from .sources import glm_website, usg, pipeline, gmn, csv, remote
from . import fov_utils as fu
from .astro_utils import haversine, _distance_metric
_FIRST_COLS = ['datetime', 'longitude', 'latitude', 'source', 'detectedBy',
'confidenceRating', 'confidence', 'lightcurveStructure', 'energy',
'integrated_energy_g16', 'integrated_energy_g17', 'integrated_energy_g18', 'integrated_energy_g19'
'peak_energy_g16', 'peak_energy_g17', 'peak_energy_g18', 'peak_energy_g19',
'peak_energy_cat_g16', 'peak_energy_cat_g17', 'peak_energy_cat_g18', 'peak_energy_cat_g19',
'impact-e', 'alt', 'vel']
utc = timezone('UTC')
[docs]
class BolideDataFrame(GeoDataFrame):
"""
Subclass of GeoPandas `~geopandas.GeoDataFrame` with additional bolide-specific methods.
Parameters
----------
source : str
Specifies the source for the initialized. Can be:
- ``'glm'``: initialize from neo-bolide-ndc.nasa.gov data
- ``'glm-pipeline'`` to initialize from ZODB database files from the GLM detection pipeline.
- ``'pickle'``: initialize from a pickled GeoDataFrame
- ``'csv'``: initialize from a .csv file
- ``'usg'``: initialize from US Government data at cneos.jpl.nasa.gov/fireballs/
- ``'gmn'``: initialize from Global Meteor Networ data at globalmeteornetwork.org/data/
files : str, list
Specifies files to be used depending on source.
- For ``'pickle'``, specifies the filename of the pickled object.
- For ``'csv'``, specifies the filename of the csv.
- For ``'glm-pipeline'``, specifies the filename(s) of the database file(s)
url : str
Specifies remote url to be used with ``'remote'``.
date : str, date, datetime
Specifies a date when using Global Meteor Network data.
str can be either yyyy-mm or yyyy-mm-dd.
annotate : bool
Whether or not to add additional metadata.
rearrange : bool
Whether or not to rearrange the columns, if coming from a CSV or Pickle.
"""
[docs]
def __init__(self, *args, **kwargs):
if len(args)==0 and len(kwargs)==0:
kwargs['source'] = 'glm'
if 'source' not in kwargs:
return super().__init__(*args, **kwargs)
defaults = {'files': None, 'annotate': True, 'rearrange': False}
kwargs = reconcile_input(kwargs, defaults)
source = kwargs['source']
files = kwargs['files']
annotate = kwargs['annotate']
rearrange = kwargs['rearrange']
# input standardization
source = source.lower()
if type(files) is str:
files = [files]
# input validation
valid_sources = ['website', 'glm', 'usg', 'pickle', 'gmn', 'csv', 'pipeline',
'glm-pipeline', 'usg-orbits', 'glm-orbits', 'remote']
if source not in valid_sources:
raise ValueError("Source \""+str(source)+"\" is unsupported. Please use one of "+str(valid_sources))
if source in ['pickle', 'csv', 'glm-pipeline', 'pipeline'] and files is None:
raise ValueError("Files must be specified for the given source \""+source+"\"")
if source in ['pickle', 'csv', 'glm-pipeline', 'pipeline'] and not all([os.path.isfile(f) for f in files]):
paths = "\n".join([os.path.abspath(f) for f in files])
raise ValueError("At least one of the files specified does not exist.\
Here are their absolute paths:\n"+paths)
if source in ['pickle', 'csv'] and len(files) > 1:
warn("More than one file given for source \""+source+"\". Only the first one will be used.")
# Initialize differently based on source.
# Each if statement creates a GeoDataFrame with the EPSG:4326 CRS
if source in ['website', 'glm']:
source = 'glm'
init_gdf = glm_website()
elif source == 'usg':
init_gdf = usg()
elif source == 'gmn':
if 'date' not in kwargs:
raise ValueError('Must specify a yyyy-mm or yyyy-mm-dd date to use GMN data')
init_gdf = gmn(kwargs['date'], loc_mode='begin')
elif source == 'pickle':
with open(files[0], 'rb') as pkl:
init_gdf = pickle.load(pkl)
elif source == 'csv':
init_gdf = csv(files[0])
elif source == 'remote':
if 'url' not in kwargs:
raise ValueError('Must specify a url with url=...')
url = kwargs['url']
init_gdf = remote(url)
elif source == 'usg-orbits':
init_gdf = remote('https://aozerov.com/data/usg-orbits.csv')
annotate = False
# elif source == 'glm-orbits':
# init_gdf = csv(file=ROOT_PATH+'/../notebooks/glm-orbits.csv')
# annotate = False
elif source in ['glm-pipeline', 'pipeline']:
source = 'glm-pipeline'
if 'min_confidence' in kwargs:
min_confidence = kwargs['min_confidence']
else:
min_confidence = 0
init_gdf = pipeline(files=files, min_confidence=min_confidence)
init_gdf['source'] = source
# rearrange columns, respecting original order if csv or pickle
if source not in ['csv', 'pickle'] or rearrange is True:
first_cols = [col for col in _FIRST_COLS if col in init_gdf.columns]
other_cols = [col for col in init_gdf.columns if col not in first_cols]
init_gdf = init_gdf[first_cols + other_cols]
# initialize the super-class (GeoDataFrame) using the created init_gdf
super().__init__(init_gdf)
# add additional metadata to the bolides
if annotate:
self.annotate()
descriptions = pd.read_csv(ROOT_PATH+'/metadata/columns.csv', index_col='column')
self.descriptions = descriptions[descriptions.sources.isin([source, 'all'])]
def annotate(self):
"""Add metadata to bolide detections"""
from .astro_utils import get_phase, get_solarhour, get_sun_alt
# lunar phase
self['phase'] = [get_phase(dt) for dt in self['datetime']]
# moon fullness
self['moon_fullness'] = -np.abs(self['phase']-0.5)*2+1
# solar hour
self['solarhour'] = [get_solarhour(data[0], data[1]) for data in zip(self['datetime'], self['longitude'])]
# solar altitude
sun_alt = np.array([get_sun_alt(dt=row['datetime'].to_pydatetime(), lat=row['latitude'], lon=row['longitude']) for _, row in self.iterrows()])
self['sun_alt_obs'] = sun_alt[:, 0]
self['sun_alt_app'] = sun_alt[:, 1]
self['date_retrieved'] = datetime.now()
def describe(self, key=None):
"""Describe a variable of the BolideDataFrame
Parameters
----------
key : str, iterable, or None
The variables to be described.
If str, describes that variable.
If iteralbe, describe all variable strings in the iterable.
If None, describe all variables in the BolideDataFrame.
Returns
-------
str :
A string describing the variable(s).
"""
if type(key) is str:
key = [key]
to_describe = self.columns if key is None else key
if 'source' in self.columns and len(self)>0:
source = self['source'].iloc[0]
else:
source = ''
sources = ['all', source]
for column in to_describe:
description = self.descriptions['description'][column] if column in self.descriptions.index else ""
print(column+":", description)
[docs]
def filter_date(self, start=None, end=None, inplace=False):
"""Filter bolides by date.
Filters the `~BolideDataFrame` using dates given in ISO format.
`start` or `end` can be left unspecified to only bound the dates on one end.
Parameters
----------
start, end: str
ISO-format strings that can be read by `~datetime.datetime.fromisoformat`.
If the timezone is not specified, it is assumed to be in UTC.
inplace: bool
If True, the `~BolideDataFrame` of this method is altered. If False, it is not, so the returned
`~BolideDataFrame` must be used.
Returns
-------
`~BolideDataFrame`
"""
new_bdf = self
# drop data before start date, if specified
if start is not None:
dt = datetime.fromisoformat(start)
# if timezone not given, make the datetime timezone-aware
# with UTC assumed as the timezone
if dt.tzname() is None:
dt = utc.localize(dt)
to_drop = self.datetime < dt
new_bdf = new_bdf.drop(new_bdf.index[to_drop], inplace=inplace)
if inplace:
new_bdf = self
# drop data after end date, if specified
if end is not None:
dt = datetime.fromisoformat(end)
# if timezone not given, make the datetime timezone-aware
# with UTC assumed as the timezone
if dt.tzname() is None:
dt = utc.localize(dt)
to_drop = new_bdf.datetime > dt
new_bdf = new_bdf.drop(new_bdf.index[to_drop], inplace=inplace)
if inplace:
new_bdf = self
# force the class, which gets converted to GeoDataFrame at some point
# in the drop methods above
force_bdf_class(new_bdf)
return new_bdf
[docs]
def get_closest_by_time(self, datestr, n=1):
"""Get the n bolides closest to a given iso-format date string
Parameters
----------
datestr : str
ISO-format string that can be read by `~datetime.datetime.fromisoformat`.
If the timezone is not specified, it is assumed to be in UTC.
n : int
Number of closest detections to return.
Returns
-------
bdf : `~BolideDataFrame`
The filtered data.
"""
dt = datetime.fromisoformat(datestr)
# if timezone not given, make the datetime timezone-aware
# with UTC assumed as the timezone
if dt.tzname() is None:
dt = utc.localize(dt)
return self.iloc[(self['datetime'] - dt).abs().argsort()].head(n)
[docs]
def get_closest_by_loc(self, lon, lat, n=1):
"""Get the n bolides closest to a given longitude and latitude
Parameters
----------
lon, lat : int
Longitude and latitude in degrees to search for bolides near.
n : int
Number of closest detections to return.
Returns
-------
bdf : `~BolideDataFrame`
The filtered data.
"""
distances = haversine(lat, lon, self['latitude'].values, self['longitude'].values)
return self.iloc[distances.argsort()].head(n)
def get_closest(self, datestr=None, lon=None, lat=None, k=1):
"""
Get the n bolides closest to a given iso-format date string and to a given
longitude and latitude in degrees. This is calculated using BallTree and a
custom distance metric that finds the expected number of pairs of bolides
that would be at least as close in space and time as the two points being
compared, under a random (uniform) distribution. Effectively, given a time
and location, this will find and cluster bolide events that are nearest to it.
Parameters
----------
datestr : str
ISO-format string that can be read by `~datetime.datetime.fromisoformat`.
If the timezone is not specified, it is assumed to be in UTC.
lon, lat : int
Longitude and latitude in degrees to search for bolides near.
k : int
Number of closest detections to return.
Returns
-------
bdf : `~BolideDataFrame`
The filtered data.
"""
if datestr is None and (lon is not None and lat is not None):
return self.get_closest_by_loc(lon, lat, n=k)
elif lon is None and lat is None and datestr is not None:
return self.get_closest_by_time(datestr, n=k)
elif lon is None or lat is None:
raise ValueError("Must specify both longitude and latitude, or a date string, or all three.")
dt = datetime.fromisoformat(datestr)
if dt.tzinfo is None:
dt = utc.localize(dt)
query_row = {
'datetime': dt,
'latitude': lat,
'longitude': lon
}
# Make a deep copy of self to avoid modifying the original
bdf = self.copy(deep=True)
# Prepend the query row
bdf = pd.concat([pd.DataFrame([query_row]), bdf], ignore_index=True)
# Compute time_seconds for all rows
bdf['time_seconds'] = (bdf['datetime'] - bdf['datetime'].min()).dt.total_seconds()
poly_goes = fu.get_boundary('goes', collection=True, intersection=False, crs=None)
fov_goes = poly_goes.area / 1e6
min_time = self['datetime'].min()
max_time = self['datetime'].max()
total_time = (max_time - min_time).total_seconds()
n = len(self)
metric = partial(_distance_metric, fov_goes=fov_goes, total_time=total_time, n=n)
X = bdf[['time_seconds', 'latitude', 'longitude']].to_numpy()
tree = BallTree(X, metric=metric)
dists, inds = tree.query(X[:1], k=k+1)
inds = inds[0][1:] # Exclude the first index (the query point itself)
cols = list(bdf.columns)
a, b = cols.index('latitude'), cols.index('longitude')
cols[b], cols[a] = cols[a], cols[b]
bdf = bdf[cols]
return bdf.iloc[inds]
[docs]
def filter_boundary(self, boundary, intersection=False, interior=True):
"""Filter data to only points within specified boundaries.
Parameters
----------
boundary: str or list of str
The boundary (boundaries) to filter the `~BolideDataFrame` by.
Refer to `~bolides.fov_utils.get_boundary`.
intersection: bool
- True: keep detections within the union of all boundaries given.
- False: keep detections within the intersection of all boundaries given.
interior: bool
- True: default behavior.
- False: keep detections outside the union or intersection of the boundaries, instead of inside.
Returns
-------
`~BolideDataFrame`
The filtered `~BolideDataFrame`
"""
import pyproj
# define Azimuthal Equidistant projection
aeqd = pyproj.Proj(proj='aeqd', ellps='WGS84', datum='WGS84', lat_0=90, lon_0=0).srs
# put bdf into the correct CRS
crs = self.geometry.crs
bdf = self.to_crs(aeqd)
from .fov_utils import get_boundary
final_polygon = get_boundary(boundary, intersection=intersection, collection=False)
# clip with the polygon, which is in Azimuthal Equidistant
points_in_poly = [pt.within(final_polygon) for pt in bdf['geometry']]
if interior:
bdf = bdf[points_in_poly]
else:
bdf = bdf[~points_in_poly]
# project bdf back to original CRS
bdf = bdf.to_crs(crs)
force_bdf_class(bdf)
return bdf
[docs]
def filter_observation(self, sensors, intersection=False):
"""Filter data to only points observable (in time and space) by a given sensor.
Parameters
----------
sensors: str or list of str
The sensor(s) to filter the `~BolideDataFrame` by:
- ``'GLM-16'``: The Geostationary Lightning Mapper aboard GOES-16.
- ``'GLM-17'``: The Geostationary Lightning Mapper aboard GOES-17.
Note that the biannual yaw flips of GOES-17 are taken into account.
- ``'GLM-18'``: The Geostationary Lightning Mapper aboard GOES-18.
- ``'GLM-19'``: The Geostationary Lightning Mapper aboard GOES-19.
The shorthand ``'G16'`` … ``'G19'`` can also be used.
intersection: bool
If True, filter for bolides observable by all sensors given.
If False, filter for bolides observable by any sensors given.
Returns
-------
`~BolideDataFrame`
The filtered `~BolideDataFrame`
"""
# input standardization and validation
if type(sensors) is str:
sensors = [sensors]
if type(sensors) is not list or not all([type(s) is str for s in sensors]):
raise ValueError("Sensors must be a string or list of strings")
sensors = [sensor.lower() for sensor in sensors]
valid_sensors = ['g16', 'g17', 'g18', 'g19', 'glm-16', 'glm-17', 'glm-18', 'glm-19']
indices = []
for num, sensor in enumerate(sensors):
# resolve the sensor to a GOES satellite and load its observation
# windows + FOVs (shared with fov_utils.get_satellites)
try:
satellite = fu._normalize_satellite(sensor)
except ValueError:
raise ValueError("Invalid sensor \""+sensor+"\". sensors must be in "+str(valid_sensors))
bdfs = []
# read csv defining the observation times and FOV
fov_df = fu.get_obs_intervals(satellite)
# for each observation time + FOV, filter_date by the observation
# times, filter_boundary by the FOV, and append result to the list
# of BolideDataFrames
for i in range(len(fov_df)):
start = None if fov_df.start.isna()[i] else fov_df.start[i]
end = None if fov_df.end.isna()[i] else fov_df.end[i]
boundary = fov_df.boundary[i]
bdfs.append(self.filter_boundary([boundary]).filter_date(start=start, end=end))
bdf = pd.concat(bdfs)
if not intersection:
indices += list(bdf.index)
elif num == 0:
indices = list(bdf.index)
else:
indices = [idx for idx in indices if idx in list(bdf.index)]
# get a list of unique indices, and sort them
indices = list(set(indices))
indices.sort()
# get the final filtered BolideDataFrame and return
filtered = self.loc[indices, :]
force_bdf_class(filtered)
return filtered
[docs]
def filter_shower(self, shower=None, years=None, padding=1, sdf=None, exclude=False):
"""Filter data to only points observable (in time and space) by a given sensor.
Parameters
----------
shower: str, int, list of str, or list of int
The meteor shower(s) to filter by. Can enter either IAU number,
IAU 3-letter code, or full shower name. Refer to the IAU Meteor
Data center at https://www.ta3.sk/IAUC22DB/MDC2007/.
years: int or list of int
The shower years to include. Default is to filter for all occurrences
of the shower(s) in the BolideDataFrame
padding: int
The number of days to pad around the peak time.
sdf: ShowerDataFrame
Optionally provide a ShowerDataFrame to use for filtering
exclude: bool
Whether or not to exclude bolides around the given showers. Default is to include.
Returns
-------
`~BolideDataFrame`
The filtered `~BolideDataFrame`
"""
if years is None:
years = list(range(min(self.datetime).year-1, max(self.datetime).year+1))
elif type(years) is int:
years = [years]
if sdf is None:
if hasattr(self, '_showers'):
sdf = self._showers
else:
from . import ShowerDataFrame
sdf = ShowerDataFrame()
self._showers = sdf
dates = sdf.get_dates(shower, years).datetime
date_padding = timedelta(days=padding)
date_ranges = [[d-date_padding, d+date_padding] for d in dates]
counts = np.sum(np.array([list(self.datetime.between(d[0], d[1])) for d in date_ranges]), axis=0)
if not exclude:
good_locs = counts != np.zeros(len(counts))
else:
good_locs = counts == np.zeros(len(counts))
return self[good_locs]
[docs]
def plot_detections(self, category=None, *args, **kwargs):
"""Plot detections of bolides.
Reprojects the geometry of bdf to the crs given, and scatters the points
on a cartopy map. kwargs are passed through to matplotlib's scatter.
Parameters
----------
crs : `~cartopy.crs.CRS`
The map projection to use. Refer to
https://scitools.org.uk/cartopy/docs/latest/reference/projections.html.
boundary : str or list of str
The boundaries to plot.
Refer to `~bolides.fov_utils.get_boundary`.
category : str
The name of a categorical column in the `~BolideDataFrame`
**kwargs :
Keyword arguments passed through to `~matplotlib.pyplot.scatter`.
Other Parameters
----------------
coastlines: bool
Whether or not to draw coastlines.
style : str
The matplotlib style to use. Refer to
https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
boundary_style : dict
The kwargs to use when plotting the boundary.
Refer to `~cartopy.mpl.geoaxes.GeoAxes.add_geometries`.
figsize : tuple
The size (width, height) of the plotted figure.
Returns
-------
fig : `~matplotlib.pyplot.figure`
ax : `~cartopy.mpl.geoaxes.GeoAxesSubplot`
"""
# filter to only detections containing both latitude and longitude
bdf = self[(~self.latitude.isnull()) & (~self.longitude.isnull())]
if category is not None:
category = self[category]
from .plotting import plot_scatter
fig, ax = plot_scatter(bdf.latitude, bdf.longitude, category=category, *args, **kwargs)
return fig, ax
[docs]
def plot_interactive(self, *args, **kwargs):
"""Plot an interactive map of bolide detections.
Parameters
----------
mode : str
Either ``'earth'`` or ``'radiant'``.
``'earth'`` plots locations on the Earth,
``'radiant'`` plots locations in the sky.
projection : str
The map projection to use. Here is the complete list:
[``'eckert4'``, ``'goes-e'``, ``'goes-w'``, ``'fy4a'``,
``'airy'``, ``'aitoff'``, ``'albers'``, ``'albers usa'``, ``'august'``,
``'azimuthal equal area'``, ``'azimuthal equidistant'``, ``'baker'``,
``'bertin1953'``, ``'boggs'``, ``'bonne'``, ``'bottomley'``, ``'bromley'``,
``'collignon'``, ``'conic conformal'``, ``'conic equal area'``, ``'conic equidistant'``,
``'craig'``, ``'craster'``, ``'cylindrical equal area'``,
``'cylindrical stereographic'``, ``'eckert1'``, ``'eckert2'``,
``'eckert3'``, ``'eckert4'``, ``'eckert5'``, ``'eckert6'``, ``'eisenlohr'``,
``'equirectangular'``, ``'fahey'``, ``'foucaut'``, ``'foucaut sinusoidal'``,
``'ginzburg4'``, ``'ginzburg5'``, ``'ginzburg6'``,
``'ginzburg8'``, ``'ginzburg9'``, ``'gnomonic'``, ``'gringorten'``,
``'gringorten quincuncial'``, ``'guyou'``, ``'hammer'``, ``'hill'``,
``'homolosine'``, ``'hufnagel'``, ``'hyperelliptical'``,
``'kavrayskiy7'``, ``'lagrange'``, ``'larrivee'``, ``'laskowski'``,
``'loximuthal'``, ``'mercator'``, ``'miller'``, ``'mollweide'``, ``'mt flat polar parabolic'``,
``'mt flat polar quartic'``, ``'mt flat polar sinusoidal'``,
``'natural earth'``, ``'natural earth1'``, ``'natural earth2'``,
``'nell hammer'``, ``'nicolosi'``, ``'orthographic'``,
``'patterson'``, ``'peirce quincuncial'``, ``'polyconic'``,
``'rectangular polyconic'``, ``'robinson'``, ``'satellite'``, ``'sinu mollweide'``,
``'sinusoidal'``, ``'stereographic'``, ``'times'``,
``'transverse mercator'``, ``'van der grinten'``, ``'van der grinten2'``,
``'van der grinten3'``, ``'van der grinten4'``,
``'wagner4'``, ``'wagner6'``, ``'wiechel'``, ``'winkel tripel'``,
``'winkel3'``]
boundary : str or list of str
The boundaries to plot.
Refer to `~bolides.fov_utils.get_boundary`.
color : str
The name of a column in the `~BolideDataFrame`
logscale : bool
Whether or not to use a logarithmic scale for the colors.
**kwargs :
Keyword arguments passed through to `~plotly.express.scatter_geo`.
Other Parameters
----------------
coastlines: bool
Whether or not to draw coastlines.
style : str
The matplotlib style to use. Refer to
https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
boundary_style : dict
The kwargs to use when plotting the boundary.
Refer to `~cartopy.mpl.geoaxes.GeoAxes.add_geometries`.
figsize : tuple
The size (width, height) of the plotted figure.
culture : str
When mode is ``'radiant'``, the asterism source culture to use. Here is the complete list:
[``'anutan'``, ``'aztec'``, ``'belarusian'``, ``'boorong'``,
``'chinese'``, ``'chinese_contemporary'``, ``'chinese_medieval'``,
``'egyptian'``, ``'hawaiian_starlines'``, ``'indian'``,
``'inuit'``, ``'japanese_moon_stations'``, ``'korean'``,
``'lokono'``, ``'macedonian'``, ``'maori'``, ``'maya'``,
``'mongolian'``, ``'navajo'``, ``'norse'``, ``'northern_andes'``,
``'romanian'``, ``'russian_siberian'``, ``'sami'``,
``'sardinian'``, ``'tongan'``, ``'tukano'``, ``'tupi'``,
``'western'``, ``'western_SnT'``, ``'western_hlad'``,
``'western_rey'``]
Returns
-------
fig : `~matplotlib.pyplot.figure`
ax : `~cartopy.mpl.geoaxes.GeoAxesSubplot`
"""
from .plotting import plot_interactive
return plot_interactive(self, *args, **kwargs)
[docs]
def plot_density(self, *args, **kwargs):
"""Plot bolide detection density.
Density is computed using scikit-learn's `~sklearn.neighbors.KernelDensity`
using the haversine distance metric (as the data is in longitude and latitude)
and gaussian kernel by default.
It is then gridded, projected, and plotted.
Parameters
----------
crs : `~cartopy.crs.CRS`
The map projection to use. Refer to
https://scitools.org.uk/cartopy/docs/latest/reference/projections.html.
bandwidth : float
The bandwidth of the Kernel Density Estimator, in degrees.
boundary : str or list of str
The boundaries to plot and clip the density by.
Refer to `~bolides.fov_utils.get_boundary`.
n_levels : int
Number of discrete density levels to plot.
lat_resolution, lon_resolution : ints
The number of discrete latitude and longitude levels when gridding the density.
**kwargs :
Keyword arguments passed through to `~cartopy.mpl.geoaxes.GeoAxes.contourf`.
Other Parameters
----------------
coastlines : bool
Whether or not to draw coastlines.
style : str
The matplotlib style to use. Refer to
https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
boundary_style : dict
The kwargs to use when plotting the boundary.
Refer to `~cartopy.mpl.geoaxes.GeoAxes.add_geometries`.
kde_params : dict
The kwargs to pass to `~sklearn.neighbors.KernelDensity`.
Note that 'metric' is not allowed to be specified, as haversine
is the only valid metric.
figsize : tuple
The size (width, height) of the plotted figure.
Returns
-------
fig : `~matplotlib.pyplot.figure`
ax : `~cartopy.mpl.geoaxes.GeoAxesSubplot`
"""
# filter to only detections containing both latitude and longitude
bdf = self[(~self.latitude.isnull()) & (~self.longitude.isnull())]
from .plotting import plot_density
fig, ax = plot_density(bdf.latitude, bdf.longitude, *args, **kwargs)
return fig, ax
[docs]
def plot_dates(self, freq='1D', logscale=False,
start=None, end=None,
figsize=(10, 3), style=MPLSTYLE,
showers=None, line_style={}, **kwargs):
"""Plot the number of bolides over time.
Parameters
----------
freq : str
The binning frequency. Can be strings like ``'2D'``, ``'1M'``, etc.
Refer to https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
start, end: strs
Optional arguments to filter by date.
Must be ISO-format strings that can be read by `~datetime.datetime.fromisoformat`.
logscale : bool
Whether or not to use a log-scale for the y-axis
**kwargs :
Keyword arguments passed through to `~matplotlib.axes.Axes.bar`.
Other Parameters
----------------
style : str
The matplotlib style to use. Refer to
https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
showers : ShowerDataFrame
A ShowerDataFrame to use for the shower data. By default, showers are pulled
from the established showers list at the IAU meter data center.
figsize : tuple
The size (width, height) of the plotted figure.
line_style : dict
The matplotlib style arguments for the vertical shower lines.
Returns
-------
fig : `~matplotlib.pyplot.figure`
ax : `~matplotlib.axes.Axes`
"""
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# filter date to given start and end times
bdf = self.filter_date(start, end, inplace=False)
datetimes = bdf.datetime
# get counts according to the given frequency
index = bdf.groupby(pd.Grouper(key='datetime', freq=freq)).count().index
if showers is not None:
# a ShowerDataFrame is stored in the _showers attribute, so that
# users can make multiple plots without constantly recreating it
if hasattr(self, '_showers'):
sdf = self._showers
else:
from . import ShowerDataFrame
sdf = ShowerDataFrame()
self._showers = sdf
date_info = sdf.get_dates(showers, range(min(bdf.datetime).year-1, max(bdf.datetime).year+1))
shower_names = date_info['shower name']
shower_dates = date_info.datetime
# with the given matplotlib style,
with plt.style.context(style):
# initialize figure and set major, minor ticks and formats
fig, ax = plt.subplots(figsize=figsize, dpi=300)
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_minor_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))
# make minor ticks (months) small, pad major ticks (years)
plt.tick_params(axis='x', which='minor', labelsize=5, pad=0, labeltop=True)
plt.tick_params(axis='x', which='major', pad=3, labeltop=True)
# set x-label, adding year data if data is only within one year
# (and hence no year data on x-axis)
ax.set_xlabel("Date")
if min(bdf.datetime).year == max(bdf.datetime).year:
ax.set_xlabel("Date (month in "+str(min(bdf.datetime).year)+")")
ax.set_ylabel("# Events")
plt.hist(datetimes, bins=index, **kwargs)
# set x-limits to data limits
x_min = datetime.fromisoformat(start) if start is not None else min(bdf.datetime)
x_max = datetime.fromisoformat(end) if end is not None else max(bdf.datetime)
plt.xlim(x_min, x_max)
if showers is not None:
names = list(shower_names)
unique = list(shower_names.unique())
import matplotlib.cm as cm
cmap = cm.Set2
defaults = {'linestyle': 'dashed', 'linewidth': 1,
'alpha': 0.5}
line_kwargs = reconcile_input(line_style, defaults)
for name, date in zip(shower_names, shower_dates):
plt.axvline(x=date, color=cmap(unique.index(name)/8),
label=name, **line_kwargs)
# remove extra labels for repeat showers
for i, p in enumerate(ax.get_lines()):
if p.get_label() in names[:i]:
p.set_label('_' + p.get_label())
plt.legend(loc='upper right')
# make y-axis logscale if specified
if logscale:
ax.set_yscale('log')
return fig, ax
[docs]
def add_website_data(self, ids=None):
"""Pull light curve and integrated energy data from neo-bolide.ndc.nasa.gov.
Downloads light curve data from neo-bolide.ndc.nasa.gov, placing it
into the BolideDataFrame as a column of `~lightkurve.LightCurveCollection` objects.
The column is named ``'lightcurves'``. Integrated energy data is also downloaded
for different satellites and placed into columns starting with ``'energy_'``.
This method will only work with BolideDataFrames having an ``_id`` column containing
IDs from GLM bolide detections at neo-bolide.ndc.nasa.gov.
Parameters
----------
ids : list of str
Optional list of strings representing the bolide ID's that additional data is
needed for. If not used, data is added for every bolide in the BolideDataFrame.
"""
# if there is no _id column, we can't associate the website data to
# the bolides in the BolideDataFrame
assert '_id' in self.columns, "BolideDataFrame must have an '_id' column"
# check that ids is not a single string
assert type(ids) is not str, "Input must be must be a list"
# make ids a List if it is an iterable
if hasattr(ids, '__iter__'):
ids = list(ids)
# check that all the given ids actually exist
assert all([_id in list(self._id) for _id in ids]), "All given IDs must exist in the BolideDataFrame"
# if no ids are given, we assume that we will obtain website data for each
# event in the BolideDataFrame
if ids == None:
ids = list(self._id)
# at this point, unless the input was not iterable, ids should be a list
assert type(ids) is list, "Input must be a list"
# get the data from the website by passing the ids
from .sources import glm_website_event
data = glm_website_event(ids)
# initialize empty columns in the BolideDataFrame
cols = data.keys()
for col in cols:
if col == 'lightcurves':
self[col] = None
else:
self[col] = np.nan
# iterate through the list of ids, plugging the obtained data
# into the appropriate rows in the BolideDataFrame
for i, _id in enumerate(ids):
# find the row containing the id
idx = self.index[np.argmax(self._id == _id)]
# enter the data
for col in cols:
self.loc[idx, col] = data[col][i]
# TODO: match on a column other than _id?
[docs]
def augment(self, new_data, time_limit=300, score_limit=5, intersection=False, outer=False):
"""Augment BolideDataFrame with data from another source
Parameters
----------
new_data : `~BolideDataFrame`
The source of the new data.
time_limit : int
Maximum time difference (seconds) for two bolides to possibly be the same
score_limit : int
Maximum score (time_limit * difference in latlon) for two bolides to
possibly be the same.
intersection : bool
- True: only keep detections that appear in both sets of data.
- False: keep all detections that appear in the first set.
outer : bool
- True: keep detections from both sets of data.
- False: keep all detections that appear in the first set.
Returns
-------
bdf : `~BolideDataFrame`
"""
# make _id column if doesn't exist
if "_id" not in self.columns:
self["_id"] = np.arange(len(self))
# iterate through rows of self, computing distance and identifying
# detections with detections in the other data if certain closeness
# criteria based on time and distance are met
ids = [""] * len(new_data)
for num, row in tqdm(self.iterrows(), "Augmenting data", total=len(self)):
deltas = row['datetime']-new_data['datetime']
s_deltas = np.abs([delta.total_seconds() for delta in deltas])
closest = np.argmin(s_deltas)
if s_deltas[closest] < time_limit:
lat_diff = abs(row['latitude']-new_data['latitude'].iloc[closest])
lon_diff = abs(row['longitude'] % 360 - new_data['longitude'].iloc[closest] % 360)
geo_diff = lat_diff+lon_diff
s_diff = s_deltas[closest]
score = geo_diff * s_diff
if score < score_limit:
ids[closest] = row['_id']
new_data['_id'] = ids
# merge the data. If taking the intersection, only keep rows in
# original BolideDataFrame that have a corresponding detection in
# the other source
if intersection:
merged = self.merge(new_data, 'inner', on='_id')
elif outer:
merged = self.merge(new_data, 'outer', on='_id')
else:
merged = self.merge(new_data, 'left', on='_id')
# rename the required columns
for col in merged.columns:
if col.endswith('_x') and col[:-2]+"_y" in merged.columns:
merged[col[:-2]] = merged[col]
del merged[col]
# force class
merged.__class__ = BolideDataFrame
return merged
def __setattr__(self, attr, val):
# since BolideDataFrame can have non-column attributes, we can ignore
# Pandas' warnings about setting attributes
filterwarnings("ignore", message="Pandas doesn't allow columns to be created via a new attribute name")
return super().__setattr__(attr, val)
# override GeoPandas' __getitem__ to force the class to remain BolideDataFrame
def __getitem__(self, key):
result = super().__getitem__(key)
force_bdf_class(result)
return result
@property
def _constructor(self):
return BolideDataFrame
# define how the BolideDataFrame renders in an IPython notebook
def _repr_html_(self):
# want to show all columns
with pd.option_context('display.max_columns', None):
# get the default representation from Pandas
df_rep = super()._repr_html_()
good_sources = ['glm', 'usg', 'glm-pipeline']
attribution = ""
# get the attribution HTML file
if 'source' in self.columns and len(self) > 0:
source = self['source'].iloc[0]
if source in good_sources:
with open(ROOT_PATH + '/metadata/' + source + '.html', 'r') as f:
attribution = f.read()
if 'source_y' in self.columns and self['source_y'].iloc[0] in good_sources:
source = self['source_y'].iloc[0]
attribution += "Augmented with "
with open(ROOT_PATH + '/metadata/' + source + '.html', 'r') as f:
attribution += f.read()
return attribution + df_rep
def force_bdf_class(bdf):
if isinstance(bdf, GeoDataFrame):
bdf.__class__ = BolideDataFrame