Source code for bolides.bdf

import os
from datetime import datetime, timedelta
from pytz import timezone
from warnings import warn, filterwarnings
from tqdm import tqdm

import numpy as np
import pandas as pd
import pickle
from sklearn.neighbors import BallTree
from geopandas import GeoDataFrame
from scipy.special import comb

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from functools import partial

from . import MPLSTYLE, ROOT_PATH
from .utils import reconcile_input
from .sources import glm_website, usg, pipeline, gmn, csv, remote
from . import fov_utils as fu
from .astro_utils import haversine, _distance_metric

_FIRST_COLS = ['datetime', 'longitude', 'latitude', 'source', 'detectedBy',
               'confidenceRating', 'confidence', 'lightcurveStructure', 'energy',
               'integrated_energy_g16', 'integrated_energy_g17', 'integrated_energy_g18', 'integrated_energy_g19'
               'peak_energy_g16', 'peak_energy_g17', 'peak_energy_g18', 'peak_energy_g19',
               'peak_energy_cat_g16', 'peak_energy_cat_g17', 'peak_energy_cat_g18', 'peak_energy_cat_g19',
               'impact-e', 'alt', 'vel']

utc = timezone('UTC')


[docs]
class BolideDataFrame(GeoDataFrame):
    """
    Subclass of GeoPandas `~geopandas.GeoDataFrame` with additional bolide-specific methods.

    Parameters
    ----------
    source : str
        Specifies the source for the initialized. Can be:

        - ``'glm'``: initialize from neo-bolide-ndc.nasa.gov data
        - ``'glm-pipeline'`` to initialize from ZODB database files from the GLM detection pipeline.
        - ``'pickle'``: initialize from a pickled GeoDataFrame
        - ``'csv'``: initialize from a .csv file
        - ``'usg'``: initialize from US Government data at cneos.jpl.nasa.gov/fireballs/
        - ``'gmn'``: initialize from Global Meteor Networ data at globalmeteornetwork.org/data/
    files : str, list

        Specifies files to be used depending on source.

        - For ``'pickle'``, specifies the filename of the pickled object.
        - For ``'csv'``, specifies the filename of the csv.
        - For ``'glm-pipeline'``, specifies the filename(s) of the database file(s)
    url : str
        Specifies remote url to be used with ``'remote'``.
    date : str, date, datetime
        Specifies a date when using Global Meteor Network data.
        str can be either yyyy-mm or yyyy-mm-dd.

    annotate : bool
        Whether or not to add additional metadata.
    rearrange : bool
        Whether or not to rearrange the columns, if coming from a CSV or Pickle.
    """
    

[docs]
    def __init__(self, *args, **kwargs):

        if len(args)==0 and len(kwargs)==0:
            kwargs['source'] = 'glm'
        if 'source' not in kwargs:
            return super().__init__(*args, **kwargs)

        defaults = {'files': None, 'annotate': True, 'rearrange': False}
        kwargs = reconcile_input(kwargs, defaults)
        source = kwargs['source']
        files = kwargs['files']
        annotate = kwargs['annotate']
        rearrange = kwargs['rearrange']

        # input standardization
        source = source.lower()
        if type(files) is str:
            files = [files]

        # input validation
        valid_sources = ['website', 'glm', 'usg', 'pickle', 'gmn', 'csv', 'pipeline',
                         'glm-pipeline', 'usg-orbits', 'glm-orbits', 'remote']
        if source not in valid_sources:
            raise ValueError("Source \""+str(source)+"\" is unsupported. Please use one of "+str(valid_sources))

        if source in ['pickle', 'csv', 'glm-pipeline', 'pipeline'] and files is None:
            raise ValueError("Files must be specified for the given source \""+source+"\"")

        if source in ['pickle', 'csv', 'glm-pipeline', 'pipeline'] and not all([os.path.isfile(f) for f in files]):
            paths = "\n".join([os.path.abspath(f) for f in files])
            raise ValueError("At least one of the files specified does not exist.\
                             Here are their absolute paths:\n"+paths)

        if source in ['pickle', 'csv'] and len(files) > 1:
            warn("More than one file given for source \""+source+"\". Only the first one will be used.")

        # Initialize differently based on source.
        # Each if statement creates a GeoDataFrame with the EPSG:4326 CRS

        if source in ['website', 'glm']:
            source = 'glm'
            init_gdf = glm_website()

        elif source == 'usg':
            init_gdf = usg()

        elif source == 'gmn':
            if 'date' not in kwargs:
                raise ValueError('Must specify a yyyy-mm or yyyy-mm-dd date to use GMN data')
            init_gdf = gmn(kwargs['date'], loc_mode='begin')

        elif source == 'pickle':
            with open(files[0], 'rb') as pkl:
                init_gdf = pickle.load(pkl)

        elif source == 'csv':
            init_gdf = csv(files[0])

        elif source == 'remote':
            if 'url' not in kwargs:
                raise ValueError('Must specify a url with url=...')
            url = kwargs['url']
            init_gdf = remote(url)

        elif source == 'usg-orbits':
            init_gdf = remote('https://aozerov.com/data/usg-orbits.csv')
            annotate = False

        # elif source == 'glm-orbits':
        #     init_gdf = csv(file=ROOT_PATH+'/../notebooks/glm-orbits.csv')
        #     annotate = False

        elif source in ['glm-pipeline', 'pipeline']:
            source = 'glm-pipeline'
            if 'min_confidence' in kwargs:
                min_confidence = kwargs['min_confidence']
            else:
                min_confidence = 0
            init_gdf = pipeline(files=files, min_confidence=min_confidence)

        init_gdf['source'] = source

        # rearrange columns, respecting original order if csv or pickle
        if source not in ['csv', 'pickle'] or rearrange is True:
            first_cols = [col for col in _FIRST_COLS if col in init_gdf.columns]
            other_cols = [col for col in init_gdf.columns if col not in first_cols]
            init_gdf = init_gdf[first_cols + other_cols]

        # initialize the super-class (GeoDataFrame) using the created init_gdf
        super().__init__(init_gdf)

        # add additional metadata to the bolides
        if annotate:
            self.annotate()

        descriptions = pd.read_csv(ROOT_PATH+'/metadata/columns.csv', index_col='column')
        self.descriptions = descriptions[descriptions.sources.isin([source, 'all'])]

        
    def annotate(self):
        """Add metadata to bolide detections"""
        from .astro_utils import get_phase, get_solarhour, get_sun_alt

        # lunar phase
        self['phase'] = [get_phase(dt) for dt in self['datetime']]
        # moon fullness
        self['moon_fullness'] = -np.abs(self['phase']-0.5)*2+1
        # solar hour
        self['solarhour'] = [get_solarhour(data[0], data[1]) for data in zip(self['datetime'], self['longitude'])]
        # solar altitude
        sun_alt = np.array([get_sun_alt(dt=row['datetime'].to_pydatetime(), lat=row['latitude'], lon=row['longitude']) for _, row in self.iterrows()])
        self['sun_alt_obs'] = sun_alt[:, 0]
        self['sun_alt_app'] = sun_alt[:, 1]

        self['date_retrieved'] = datetime.now()

    def describe(self, key=None):
        """Describe a variable of the BolideDataFrame

        Parameters
        ----------
        key : str, iterable, or None
            The variables to be described.
            If str, describes that variable.
            If iteralbe, describe all variable strings in the iterable.
            If None, describe all variables in the BolideDataFrame.

        Returns
        -------
        str :
            A string describing the variable(s).
        """

        if type(key) is str:
            key = [key]
        to_describe = self.columns if key is None else key
        if 'source' in self.columns and len(self)>0:
            source = self['source'].iloc[0]
        else:
            source = ''
        sources = ['all', source]
        for column in to_describe:
            description = self.descriptions['description'][column] if column in self.descriptions.index else ""
            print(column+":", description)


[docs]
    def filter_date(self, start=None, end=None, inplace=False):
        """Filter bolides by date.

        Filters the `~BolideDataFrame` using dates given in ISO format.
        `start` or `end` can be left unspecified to only bound the dates on one end.

        Parameters
        ----------
        start, end: str
            ISO-format strings that can be read by `~datetime.datetime.fromisoformat`.
            If the timezone is not specified, it is assumed to be in UTC.
        inplace: bool
            If True, the `~BolideDataFrame` of this method is altered. If False, it is not, so the returned
            `~BolideDataFrame` must be used.

        Returns
        -------
        `~BolideDataFrame`
        """
        new_bdf = self

        # drop data before start date, if specified
        if start is not None:
            dt = datetime.fromisoformat(start)

            # if timezone not given, make the datetime timezone-aware
            # with UTC assumed as the timezone
            if dt.tzname() is None:
                dt = utc.localize(dt)

            to_drop = self.datetime < dt
            new_bdf = new_bdf.drop(new_bdf.index[to_drop], inplace=inplace)
        if inplace:
            new_bdf = self
        # drop data after end date, if specified
        if end is not None:
            dt = datetime.fromisoformat(end)

            # if timezone not given, make the datetime timezone-aware
            # with UTC assumed as the timezone
            if dt.tzname() is None:
                dt = utc.localize(dt)

            to_drop = new_bdf.datetime > dt
            new_bdf = new_bdf.drop(new_bdf.index[to_drop], inplace=inplace)
        if inplace:
            new_bdf = self

        # force the class, which gets converted to GeoDataFrame at some point
        # in the drop methods above
        force_bdf_class(new_bdf)

        return new_bdf



[docs]
    def get_closest_by_time(self, datestr, n=1):
        """Get the n bolides closest to a given iso-format date string

        Parameters
        ----------
        datestr : str
            ISO-format string that can be read by `~datetime.datetime.fromisoformat`.
            If the timezone is not specified, it is assumed to be in UTC.
        n : int
            Number of closest detections to return.

        Returns
        -------
        bdf : `~BolideDataFrame`
            The filtered data.

        """

        dt = datetime.fromisoformat(datestr)

        # if timezone not given, make the datetime timezone-aware
        # with UTC assumed as the timezone
        if dt.tzname() is None:
            dt = utc.localize(dt)

        return self.iloc[(self['datetime'] - dt).abs().argsort()].head(n)



[docs]
    def get_closest_by_loc(self, lon, lat, n=1):
        """Get the n bolides closest to a given longitude and latitude

        Parameters
        ----------
        lon, lat : int
            Longitude and latitude in degrees to search for bolides near.
        n : int
            Number of closest detections to return.

        Returns
        -------
        bdf : `~BolideDataFrame`
            The filtered data.

        """
        distances = haversine(lat, lon, self['latitude'].values, self['longitude'].values)
        return self.iloc[distances.argsort()].head(n)


    def get_closest(self, datestr=None, lon=None, lat=None, k=1):
        """
        Get the n bolides closest to a given iso-format date string and to a given
        longitude and latitude in degrees. This is calculated using BallTree and a
        custom distance metric that finds the expected number of pairs of bolides
        that would be at least as close in space and time as the two points being
        compared, under a random (uniform) distribution. Effectively, given a time
        and location, this will find and cluster bolide events that are nearest to it.

        Parameters
        ----------
        datestr : str
            ISO-format string that can be read by `~datetime.datetime.fromisoformat`.
            If the timezone is not specified, it is assumed to be in UTC.
        lon, lat : int
            Longitude and latitude in degrees to search for bolides near.
        k : int
            Number of closest detections to return.

        Returns
        -------
        bdf : `~BolideDataFrame`
            The filtered data.

        """
        if datestr is None and (lon is not None and lat is not None):
            return self.get_closest_by_loc(lon, lat, n=k)
        elif lon is None and lat is None and datestr is not None:
            return self.get_closest_by_time(datestr, n=k)
        elif lon is None or lat is None:
            raise ValueError("Must specify both longitude and latitude, or a date string, or all three.")
        
        

        dt = datetime.fromisoformat(datestr)
        if dt.tzinfo is None:
            dt = utc.localize(dt)

        query_row = {
            'datetime': dt,
            'latitude': lat,
            'longitude': lon
        }

        # Make a deep copy of self to avoid modifying the original
        bdf = self.copy(deep=True)

        # Prepend the query row
        bdf = pd.concat([pd.DataFrame([query_row]), bdf], ignore_index=True)

        # Compute time_seconds for all rows
        bdf['time_seconds'] = (bdf['datetime'] - bdf['datetime'].min()).dt.total_seconds()

        poly_goes = fu.get_boundary('goes', collection=True, intersection=False, crs=None)
        fov_goes = poly_goes.area / 1e6
        min_time = self['datetime'].min()
        max_time = self['datetime'].max()
        total_time = (max_time - min_time).total_seconds()
        n = len(self)

        metric = partial(_distance_metric, fov_goes=fov_goes, total_time=total_time, n=n)
        
        X = bdf[['time_seconds', 'latitude', 'longitude']].to_numpy()

        tree = BallTree(X, metric=metric)
        
        dists, inds = tree.query(X[:1], k=k+1)
        
        inds = inds[0][1:]  # Exclude the first index (the query point itself)

        cols = list(bdf.columns)
        a, b = cols.index('latitude'), cols.index('longitude')
        cols[b], cols[a] = cols[a], cols[b]
        bdf = bdf[cols]

        return bdf.iloc[inds]


[docs]
    def filter_boundary(self, boundary, intersection=False, interior=True):
        """Filter data to only points within specified boundaries.

        Parameters
        ----------
        boundary: str or list of str
            The boundary (boundaries) to filter the `~BolideDataFrame` by.
            Refer to `~bolides.fov_utils.get_boundary`.
        intersection: bool
            - True: keep detections within the union of all boundaries given.
            - False: keep detections within the intersection of all boundaries given.
        interior: bool
            - True: default behavior.
            - False: keep detections outside the union or intersection of the boundaries, instead of inside.

        Returns
        -------
        `~BolideDataFrame`
            The filtered `~BolideDataFrame`
        """

        import pyproj
        # define Azimuthal Equidistant projection
        aeqd = pyproj.Proj(proj='aeqd', ellps='WGS84', datum='WGS84', lat_0=90, lon_0=0).srs

        # put bdf into the correct CRS
        crs = self.geometry.crs
        bdf = self.to_crs(aeqd)

        from .fov_utils import get_boundary
        final_polygon = get_boundary(boundary, intersection=intersection, collection=False)

        # clip with the polygon, which is in Azimuthal Equidistant
        points_in_poly = [pt.within(final_polygon) for pt in bdf['geometry']]
        if interior:
            bdf = bdf[points_in_poly]
        else:
            bdf = bdf[~points_in_poly]
        # project bdf back to original CRS
        bdf = bdf.to_crs(crs)

        force_bdf_class(bdf)

        return bdf



[docs]
    def filter_observation(self, sensors, intersection=False):
        """Filter data to only points observable (in time and space) by a given sensor.

        Parameters
        ----------
        sensors: str or list of str
            The sensor(s) to filter the `~BolideDataFrame` by:

            - ``'GLM-16'``: The Geostationary Lightning Mapper aboard GOES-16.
            - ``'GLM-17'``: The Geostationary Lightning Mapper aboard GOES-17.
              Note that the biannual yaw flips of GOES-17 are taken into account.
            - ``'GLM-18'``: The Geostationary Lightning Mapper aboard GOES-18.
            - ``'GLM-19'``: The Geostationary Lightning Mapper aboard GOES-19.

            The shorthand ``'G16'`` … ``'G19'`` can also be used.

        intersection: bool
            If True, filter for bolides observable by all sensors given.
            If False, filter for bolides observable by any sensors given.

        Returns
        -------
        `~BolideDataFrame`
            The filtered `~BolideDataFrame`
        """

        # input standardization and validation
        if type(sensors) is str:
            sensors = [sensors]
        if type(sensors) is not list or not all([type(s) is str for s in sensors]):
            raise ValueError("Sensors must be a string or list of strings")
        sensors = [sensor.lower() for sensor in sensors]

        valid_sensors = ['g16', 'g17', 'g18', 'g19', 'glm-16', 'glm-17', 'glm-18', 'glm-19']

        indices = []
        for num, sensor in enumerate(sensors):
            # resolve the sensor to a GOES satellite and load its observation
            # windows + FOVs (shared with fov_utils.get_satellites)
            try:
                satellite = fu._normalize_satellite(sensor)
            except ValueError:
                raise ValueError("Invalid sensor \""+sensor+"\". sensors must be in "+str(valid_sensors))

            bdfs = []

            # read csv defining the observation times and FOV
            fov_df = fu.get_obs_intervals(satellite)

            # for each observation time + FOV, filter_date by the observation
            # times, filter_boundary by the FOV, and append result to the list
            # of BolideDataFrames
            for i in range(len(fov_df)):
                start = None if fov_df.start.isna()[i] else fov_df.start[i]
                end = None if fov_df.end.isna()[i] else fov_df.end[i]
                boundary = fov_df.boundary[i]
                bdfs.append(self.filter_boundary([boundary]).filter_date(start=start, end=end))
            bdf = pd.concat(bdfs)
            if not intersection:
                indices += list(bdf.index)
            elif num == 0:
                indices = list(bdf.index)
            else:
                indices = [idx for idx in indices if idx in list(bdf.index)]

        # get a list of unique indices, and sort them
        indices = list(set(indices))
        indices.sort()

        # get the final filtered BolideDataFrame and return
        filtered = self.loc[indices, :]
        force_bdf_class(filtered)
        return filtered



[docs]
    def filter_shower(self, shower=None, years=None, padding=1, sdf=None, exclude=False):
        """Filter data to only points observable (in time and space) by a given sensor.

        Parameters
        ----------
        shower: str, int, list of str, or list of int
            The meteor shower(s) to filter by. Can enter either IAU number,
            IAU 3-letter code, or full shower name. Refer to the IAU Meteor
            Data center at https://www.ta3.sk/IAUC22DB/MDC2007/.
        years: int or list of int
            The shower years to include. Default is to filter for all occurrences
            of the shower(s) in the BolideDataFrame
        padding: int
            The number of days to pad around the peak time.
        sdf: ShowerDataFrame
            Optionally provide a ShowerDataFrame to use for filtering
        exclude: bool
            Whether or not to exclude bolides around the given showers. Default is to include.

        Returns
        -------
        `~BolideDataFrame`
            The filtered `~BolideDataFrame`
        """
        if years is None:
            years = list(range(min(self.datetime).year-1, max(self.datetime).year+1))
        elif type(years) is int:
            years = [years]

        if sdf is None:
            if hasattr(self, '_showers'):
                sdf = self._showers
            else:
                from . import ShowerDataFrame
                sdf = ShowerDataFrame()
                self._showers = sdf

        dates = sdf.get_dates(shower, years).datetime
        date_padding = timedelta(days=padding)
        date_ranges = [[d-date_padding, d+date_padding] for d in dates]

        counts = np.sum(np.array([list(self.datetime.between(d[0], d[1])) for d in date_ranges]), axis=0)
        if not exclude:
            good_locs = counts != np.zeros(len(counts))
        else:
            good_locs = counts == np.zeros(len(counts))
        return self[good_locs]



[docs]
    def plot_detections(self, category=None, *args, **kwargs):
        """Plot detections of bolides.

        Reprojects the geometry of bdf to the crs given, and scatters the points
        on a cartopy map. kwargs are passed through to matplotlib's scatter.

        Parameters
        ----------
        crs : `~cartopy.crs.CRS`
            The map projection to use. Refer to
            https://scitools.org.uk/cartopy/docs/latest/reference/projections.html.
        boundary : str or list of str
            The boundaries to plot.
            Refer to `~bolides.fov_utils.get_boundary`.
        category : str
            The name of a categorical column in the `~BolideDataFrame`
        **kwargs :
            Keyword arguments passed through to `~matplotlib.pyplot.scatter`.

        Other Parameters
        ----------------
        coastlines: bool
            Whether or not to draw coastlines.
        style : str
            The matplotlib style to use. Refer to
            https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
        boundary_style : dict
            The kwargs to use when plotting the boundary.
            Refer to `~cartopy.mpl.geoaxes.GeoAxes.add_geometries`.
        figsize : tuple
            The size (width, height) of the plotted figure.

        Returns
        -------
        fig : `~matplotlib.pyplot.figure`
        ax : `~cartopy.mpl.geoaxes.GeoAxesSubplot`
        """

        # filter to only detections containing both latitude and longitude
        bdf = self[(~self.latitude.isnull()) & (~self.longitude.isnull())]

        if category is not None:
            category = self[category]

        from .plotting import plot_scatter
        fig, ax = plot_scatter(bdf.latitude, bdf.longitude, category=category, *args, **kwargs)

        return fig, ax



[docs]
    def plot_interactive(self, *args, **kwargs):
        """Plot an interactive map of bolide detections.

        Parameters
        ----------
        mode : str
            Either ``'earth'`` or ``'radiant'``.
            ``'earth'`` plots locations on the Earth,
            ``'radiant'`` plots locations in the sky.
        projection : str
            The map projection to use. Here is the complete list:
            [``'eckert4'``, ``'goes-e'``, ``'goes-w'``, ``'fy4a'``,
            ``'airy'``, ``'aitoff'``, ``'albers'``, ``'albers usa'``, ``'august'``,
            ``'azimuthal equal area'``, ``'azimuthal equidistant'``, ``'baker'``,
            ``'bertin1953'``, ``'boggs'``, ``'bonne'``, ``'bottomley'``, ``'bromley'``,
            ``'collignon'``, ``'conic conformal'``, ``'conic equal area'``, ``'conic equidistant'``,
            ``'craig'``, ``'craster'``, ``'cylindrical equal area'``,
            ``'cylindrical stereographic'``, ``'eckert1'``, ``'eckert2'``,
            ``'eckert3'``, ``'eckert4'``, ``'eckert5'``, ``'eckert6'``, ``'eisenlohr'``,
            ``'equirectangular'``, ``'fahey'``, ``'foucaut'``, ``'foucaut sinusoidal'``,
            ``'ginzburg4'``, ``'ginzburg5'``, ``'ginzburg6'``,
            ``'ginzburg8'``, ``'ginzburg9'``, ``'gnomonic'``, ``'gringorten'``,
            ``'gringorten quincuncial'``, ``'guyou'``, ``'hammer'``, ``'hill'``,
            ``'homolosine'``, ``'hufnagel'``, ``'hyperelliptical'``,
            ``'kavrayskiy7'``, ``'lagrange'``, ``'larrivee'``, ``'laskowski'``,
            ``'loximuthal'``, ``'mercator'``, ``'miller'``, ``'mollweide'``, ``'mt flat polar parabolic'``,
            ``'mt flat polar quartic'``, ``'mt flat polar sinusoidal'``,
            ``'natural earth'``, ``'natural earth1'``, ``'natural earth2'``,
            ``'nell hammer'``, ``'nicolosi'``, ``'orthographic'``,
            ``'patterson'``, ``'peirce quincuncial'``, ``'polyconic'``,
            ``'rectangular polyconic'``, ``'robinson'``, ``'satellite'``, ``'sinu mollweide'``,
            ``'sinusoidal'``, ``'stereographic'``, ``'times'``,
            ``'transverse mercator'``, ``'van der grinten'``, ``'van der grinten2'``,
            ``'van der grinten3'``, ``'van der grinten4'``,
            ``'wagner4'``, ``'wagner6'``, ``'wiechel'``, ``'winkel tripel'``,
            ``'winkel3'``]
        boundary : str or list of str
            The boundaries to plot.
            Refer to `~bolides.fov_utils.get_boundary`.
        color : str
            The name of a column in the `~BolideDataFrame`
        logscale : bool
            Whether or not to use a logarithmic scale for the colors.
        **kwargs :
            Keyword arguments passed through to `~plotly.express.scatter_geo`.

        Other Parameters
        ----------------
        coastlines: bool
            Whether or not to draw coastlines.
        style : str
            The matplotlib style to use. Refer to
            https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
        boundary_style : dict
            The kwargs to use when plotting the boundary.
            Refer to `~cartopy.mpl.geoaxes.GeoAxes.add_geometries`.
        figsize : tuple
            The size (width, height) of the plotted figure.
        culture : str
            When mode is ``'radiant'``, the asterism source culture to use. Here is the complete list:
            [``'anutan'``, ``'aztec'``, ``'belarusian'``, ``'boorong'``,
             ``'chinese'``, ``'chinese_contemporary'``, ``'chinese_medieval'``,
             ``'egyptian'``, ``'hawaiian_starlines'``, ``'indian'``,
             ``'inuit'``, ``'japanese_moon_stations'``, ``'korean'``,
             ``'lokono'``, ``'macedonian'``, ``'maori'``, ``'maya'``,
             ``'mongolian'``, ``'navajo'``, ``'norse'``, ``'northern_andes'``,
             ``'romanian'``, ``'russian_siberian'``, ``'sami'``,
             ``'sardinian'``, ``'tongan'``, ``'tukano'``, ``'tupi'``,
             ``'western'``, ``'western_SnT'``, ``'western_hlad'``,
             ``'western_rey'``]


        Returns
        -------
        fig : `~matplotlib.pyplot.figure`
        ax : `~cartopy.mpl.geoaxes.GeoAxesSubplot`
        """
        from .plotting import plot_interactive
        return plot_interactive(self, *args, **kwargs)



[docs]
    def plot_density(self, *args, **kwargs):
        """Plot bolide detection density.

        Density is computed using scikit-learn's `~sklearn.neighbors.KernelDensity`
        using the haversine distance metric (as the data is in longitude and latitude)
        and gaussian kernel by default.
        It is then gridded, projected, and plotted.

        Parameters
        ----------
        crs : `~cartopy.crs.CRS`
            The map projection to use. Refer to
            https://scitools.org.uk/cartopy/docs/latest/reference/projections.html.
        bandwidth : float
            The bandwidth of the Kernel Density Estimator, in degrees.
        boundary : str or list of str
            The boundaries to plot and clip the density by.
            Refer to `~bolides.fov_utils.get_boundary`.
        n_levels : int
            Number of discrete density levels to plot.
        lat_resolution, lon_resolution : ints
            The number of discrete latitude and longitude levels when gridding the density.
        **kwargs :
            Keyword arguments passed through to `~cartopy.mpl.geoaxes.GeoAxes.contourf`.

        Other Parameters
        ----------------
        coastlines : bool
            Whether or not to draw coastlines.
        style : str
            The matplotlib style to use. Refer to
            https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
        boundary_style : dict
            The kwargs to use when plotting the boundary.
            Refer to `~cartopy.mpl.geoaxes.GeoAxes.add_geometries`.
        kde_params : dict
            The kwargs to pass to `~sklearn.neighbors.KernelDensity`.
            Note that 'metric' is not allowed to be specified, as haversine
            is the only valid metric.
        figsize : tuple
            The size (width, height) of the plotted figure.

        Returns
        -------
        fig : `~matplotlib.pyplot.figure`
        ax : `~cartopy.mpl.geoaxes.GeoAxesSubplot`
        """

        # filter to only detections containing both latitude and longitude
        bdf = self[(~self.latitude.isnull()) & (~self.longitude.isnull())]

        from .plotting import plot_density
        fig, ax = plot_density(bdf.latitude, bdf.longitude, *args, **kwargs)

        return fig, ax



[docs]
    def plot_dates(self, freq='1D', logscale=False,
                   start=None, end=None,
                   figsize=(10, 3), style=MPLSTYLE,
                   showers=None, line_style={}, **kwargs):
        """Plot the number of bolides over time.

        Parameters
        ----------
        freq : str
            The binning frequency. Can be strings like ``'2D'``, ``'1M'``, etc.
            Refer to https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
        start, end: strs
            Optional arguments to filter by date.
            Must be ISO-format strings that can be read by `~datetime.datetime.fromisoformat`.
        logscale : bool
            Whether or not to use a log-scale for the y-axis
        **kwargs :
            Keyword arguments passed through to `~matplotlib.axes.Axes.bar`.

        Other Parameters
        ----------------
        style : str
            The matplotlib style to use. Refer to
            https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
        showers : ShowerDataFrame
            A ShowerDataFrame to use for the shower data. By default, showers are pulled
            from the established showers list at the IAU meter data center.
        figsize : tuple
            The size (width, height) of the plotted figure.
        line_style : dict
            The matplotlib style arguments for the vertical shower lines.

        Returns
        -------
        fig : `~matplotlib.pyplot.figure`
        ax : `~matplotlib.axes.Axes`
        """
        from pandas.plotting import register_matplotlib_converters
        register_matplotlib_converters()

        # filter date to given start and end times
        bdf = self.filter_date(start, end, inplace=False)
        datetimes = bdf.datetime

        # get counts according to the given frequency
        index = bdf.groupby(pd.Grouper(key='datetime', freq=freq)).count().index

        if showers is not None:

            # a ShowerDataFrame is stored in the _showers attribute, so that
            # users can make multiple plots without constantly recreating it
            if hasattr(self, '_showers'):
                sdf = self._showers
            else:
                from . import ShowerDataFrame
                sdf = ShowerDataFrame()
                self._showers = sdf
            date_info = sdf.get_dates(showers, range(min(bdf.datetime).year-1, max(bdf.datetime).year+1))
            shower_names = date_info['shower name']
            shower_dates = date_info.datetime

        # with the given matplotlib style,
        with plt.style.context(style):

            # initialize figure and set major, minor ticks and formats
            fig, ax = plt.subplots(figsize=figsize, dpi=300)
            ax.xaxis.set_major_locator(mdates.YearLocator())
            ax.xaxis.set_minor_locator(mdates.MonthLocator())
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
            ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))

            # make minor ticks (months) small, pad major ticks (years)
            plt.tick_params(axis='x', which='minor', labelsize=5, pad=0, labeltop=True)
            plt.tick_params(axis='x', which='major', pad=3, labeltop=True)

            # set x-label, adding year data if data is only within one year
            # (and hence no year data on x-axis)
            ax.set_xlabel("Date")
            if min(bdf.datetime).year == max(bdf.datetime).year:
                ax.set_xlabel("Date (month in "+str(min(bdf.datetime).year)+")")
            ax.set_ylabel("# Events")

            plt.hist(datetimes, bins=index, **kwargs)

            # set x-limits to data limits
            x_min = datetime.fromisoformat(start) if start is not None else min(bdf.datetime)
            x_max = datetime.fromisoformat(end) if end is not None else max(bdf.datetime)
            plt.xlim(x_min, x_max)

            if showers is not None:
                names = list(shower_names)
                unique = list(shower_names.unique())
                import matplotlib.cm as cm
                cmap = cm.Set2

                defaults = {'linestyle': 'dashed', 'linewidth': 1,
                            'alpha': 0.5}
                line_kwargs = reconcile_input(line_style, defaults)

                for name, date in zip(shower_names, shower_dates):
                    plt.axvline(x=date, color=cmap(unique.index(name)/8),
                                label=name, **line_kwargs)
                # remove extra labels for repeat showers
                for i, p in enumerate(ax.get_lines()):
                    if p.get_label() in names[:i]:
                        p.set_label('_' + p.get_label())
                plt.legend(loc='upper right')

            # make y-axis logscale if specified
            if logscale:
                ax.set_yscale('log')

        return fig, ax



[docs]
    def add_website_data(self, ids=None):
        """Pull light curve and integrated energy data from neo-bolide.ndc.nasa.gov.

        Downloads light curve data from neo-bolide.ndc.nasa.gov, placing it
        into the BolideDataFrame as a column of `~lightkurve.LightCurveCollection` objects.
        The column is named ``'lightcurves'``. Integrated energy data is also downloaded
        for different satellites and placed into columns starting with ``'energy_'``.
        This method will only work with BolideDataFrames having an ``_id`` column containing
        IDs from GLM bolide detections at neo-bolide.ndc.nasa.gov.

        Parameters
        ----------
        ids : list of str
            Optional list of strings representing the bolide ID's that additional data is
            needed for. If not used, data is added for every bolide in the BolideDataFrame.
        """

        # if there is no _id column, we can't associate the website data to
        # the bolides in the BolideDataFrame
        assert '_id' in self.columns, "BolideDataFrame must have an '_id' column"

        # check that ids is not a single string
        assert type(ids) is not str, "Input must be must be a list"

        # make ids a List if it is an iterable
        if hasattr(ids, '__iter__'):
            ids = list(ids)

            # check that all the given ids actually exist
            assert all([_id in list(self._id) for _id in ids]), "All given IDs must exist in the BolideDataFrame"

        # if no ids are given, we assume that we will obtain website data for each
        # event in the BolideDataFrame
        if ids == None:
            ids = list(self._id)

        # at this point, unless the input was not iterable, ids should be a list
        assert type(ids) is list, "Input must be a list"

        # get the data from the website by passing the ids
        from .sources import glm_website_event
        data = glm_website_event(ids)

        # initialize empty columns in the BolideDataFrame
        cols = data.keys()
        for col in cols:
            if col == 'lightcurves':
                self[col] = None
            else:
                self[col] = np.nan

        # iterate through the list of ids, plugging the obtained data
        # into the appropriate rows in the BolideDataFrame
        for i, _id in enumerate(ids):

            # find the row containing the id
            idx = self.index[np.argmax(self._id == _id)]

            # enter the data
            for col in cols:
                self.loc[idx, col] = data[col][i]


    # TODO: match on a column other than _id?

[docs]
    def augment(self, new_data, time_limit=300, score_limit=5, intersection=False, outer=False):
        """Augment BolideDataFrame with data from another source

        Parameters
        ----------
        new_data : `~BolideDataFrame`
            The source of the new data.
        time_limit : int
            Maximum time difference (seconds) for two bolides to possibly be the same
        score_limit : int
            Maximum score (time_limit * difference in latlon) for two bolides to
            possibly be the same.
        intersection : bool
            - True: only keep detections that appear in both sets of data.
            - False: keep all detections that appear in the first set.
        outer : bool
            - True: keep detections from both sets of data.
            - False: keep all detections that appear in the first set.

        Returns
        -------
        bdf : `~BolideDataFrame`
        """

        # make _id column if doesn't exist
        if "_id" not in self.columns:
            self["_id"] = np.arange(len(self))

        # iterate through rows of self, computing distance and identifying
        # detections with detections in the other data if certain closeness
        # criteria based on time and distance are met
        ids = [""] * len(new_data)
        for num, row in tqdm(self.iterrows(), "Augmenting data", total=len(self)):
            deltas = row['datetime']-new_data['datetime']
            s_deltas = np.abs([delta.total_seconds() for delta in deltas])
            closest = np.argmin(s_deltas)
            if s_deltas[closest] < time_limit:
                lat_diff = abs(row['latitude']-new_data['latitude'].iloc[closest])
                lon_diff = abs(row['longitude'] % 360 - new_data['longitude'].iloc[closest] % 360)
                geo_diff = lat_diff+lon_diff
                s_diff = s_deltas[closest]
                score = geo_diff * s_diff
                if score < score_limit:
                    ids[closest] = row['_id']
        new_data['_id'] = ids
        # merge the data. If taking the intersection, only keep rows in
        # original BolideDataFrame that have a corresponding detection in
        # the other source
        if intersection:
            merged = self.merge(new_data, 'inner', on='_id')
        elif outer:
            merged = self.merge(new_data, 'outer', on='_id')
        else:
            merged = self.merge(new_data, 'left', on='_id')

        # rename the required columns
        for col in merged.columns:
            if col.endswith('_x') and col[:-2]+"_y" in merged.columns:
                merged[col[:-2]] = merged[col]
                del merged[col]

        # force class
        merged.__class__ = BolideDataFrame

        return merged


    def __setattr__(self, attr, val):
        # since BolideDataFrame can have non-column attributes, we can ignore
        # Pandas' warnings about setting attributes
        filterwarnings("ignore", message="Pandas doesn't allow columns to be created via a new attribute name")
        return super().__setattr__(attr, val)

    # override GeoPandas' __getitem__ to force the class to remain BolideDataFrame
    def __getitem__(self, key):
        result = super().__getitem__(key)
        force_bdf_class(result)
        return result

    @property
    def _constructor(self):
        return BolideDataFrame

    # define how the BolideDataFrame renders in an IPython notebook
    def _repr_html_(self):

        # want to show all columns
        with pd.option_context('display.max_columns', None):
            # get the default representation from Pandas
            df_rep = super()._repr_html_()

        good_sources = ['glm', 'usg', 'glm-pipeline']

        attribution = ""

        # get the attribution HTML file
        if 'source' in self.columns and len(self) > 0:
            source = self['source'].iloc[0]
            if source in good_sources:
                with open(ROOT_PATH + '/metadata/' + source + '.html', 'r') as f:
                    attribution = f.read()
            if 'source_y' in self.columns and self['source_y'].iloc[0] in good_sources:
                source = self['source_y'].iloc[0]
                attribution += "Augmented with "
                with open(ROOT_PATH + '/metadata/' + source + '.html', 'r') as f:
                    attribution += f.read()
        return attribution + df_rep



def force_bdf_class(bdf):
    if isinstance(bdf, GeoDataFrame):
        bdf.__class__ = BolideDataFrame