Correlating two GeoSeries

This notebook performs a small analysis: computing the correlation between two GeoSeries. It queries the LiPDverse SPARQL endpoint for Pages2kTemperature records, builds pyleoclim GeoSeries from the results, aligns two of them on a common time interval, and tests the significance of their correlation, both on ensemble medians and across age-perturbed ensembles.

import json
import io
import requests
import pandas as pd
import pyleoclim as pyleo
url = 'https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse-dynamic'  # consider pinning a version (5?) to avoid breaking changes
url_d = {
    '{le_var}': 'http://linked.earth/ontology/paleo_variables#',
    '{wgs84}': 'http://www.w3.org/2003/01/geo/wgs84_pos#',
    '{le}': 'http://linked.earth/ontology#',
    '{rdfs}': 'http://www.w3.org/2000/01/rdf-schema#'
}
query = """
PREFIX le: <{le}>
PREFIX le_var: <{le_var}>
PREFIX wgs84: <{wgs84}>
PREFIX rdfs: <{rdfs}>

SELECT DISTINCT 
    ?dataSetName ?archiveType ?geo_meanLat ?geo_meanLon ?geo_meanElev 
    ?paleoData_variableName  ?resolution ?paleoData_units ?paleoData_values ?time_values 
    ?paleoData_proxy ?paleoData_proxyGeneral 
    ?time_variableName  ?time_units ?maxTime ?minTime
    ?compilationName ?TSID 
    ?startDate ?endDate 
WHERE {

    ?ds a le:Dataset .
    ?ds le:hasName ?dataSetName .

    OPTIONAL {
        ?ds le:hasArchiveType ?archiveTypeObj .
        ?archiveTypeObj rdfs:label ?archiveType .
    }

    ?ds le:hasLocation ?loc .
    OPTIONAL {?loc wgs84:lat ?geo_meanLat .}
    OPTIONAL {?loc wgs84:long ?geo_meanLon .}      
    OPTIONAL {?loc wgs84:alt ?geo_meanElev .}

    ?ds le:hasPaleoData ?data .
    ?data le:hasMeasurementTable ?table .
    ?table le:hasVariable ?var .

    ?var le:hasName ?paleoData_variableName .
    ?var le:hasValues ?paleoData_values .
    
    OPTIONAL {
        ?var le:hasUnits ?paleoData_unitsObj .
        ?paleoData_unitsObj rdfs:label ?paleoData_units .
    }
    OPTIONAL {
        ?var le:hasProxy ?paleoData_proxyObj .
        ?paleoData_proxyObj rdfs:label ?paleoData_proxy .
    }
    OPTIONAL {
        ?var le:hasProxyGeneral ?paleoData_proxyGeneralObj .
        ?paleoData_proxyGeneralObj rdfs:label ?paleoData_proxyGeneral .
    }

    ?var le:partOfCompilation ?compilation .
    ?compilation le:hasName ?compilationName .
    VALUES ?compilationName {"Pages2kTemperature"} .

    ?var le:useInGlobalTemperatureAnalysis True .

    OPTIONAL { ?var le:hasVariableId ?TSID }

    ?table le:hasVariable ?timevar .
    ?timevar le:hasName ?time_variableName .
    ?timevar le:hasStandardVariable le_var:year .
    ?timevar le:hasValues ?time_values .
    ?timevar le:hasMinValue ?minTime .
    ?timevar le:hasMaxValue ?maxTime .
    OPTIONAL{
        ?timevar le:hasUnits ?time_unitsObj .
        ?time_unitsObj rdfs:label ?time_units .
    }


}
"""
def update_urls(query, url_d):
    """Substitute the {prefix} placeholders in the query with their full URIs."""
    for key, value in url_d.items():
        query = query.replace(key, value)
    return query
query = update_urls(query, url_d)
response = requests.post(url, data={'query': query})

data = io.StringIO(response.text)
df = pd.read_csv(data, sep=",")
df.head()
| | dataSetName | archiveType | geo_meanLat | geo_meanLon | geo_meanElev | paleoData_variableName | resolution | paleoData_units | paleoData_values | time_values | paleoData_proxy | paleoData_proxyGeneral | time_variableName | time_units | maxTime | minTime | compilationName | TSID | startDate | endDate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Ant-WDC05A.Steig.2013 | Glacier ice | -79.46 | -112.09 | 1806.0 | d18O | NaN | permil | [-33.32873325, -35.6732, -33.1574, -34.2854, -... | [2005, 2004, 2003, 2002, 2001, 2000, 1999, 199... | d18O | NaN | year | yr AD | 2005.0 | 786.0 | Pages2kTemperature | Ant_030 | NaN | NaN |
| 1 | NAm-MtLemon.Briffa.2002 | Wood | 32.50 | -110.80 | 2700.0 | MXD | NaN | NaN | [0.968, 0.962, 1.013, 0.95, 1.008, 0.952, 1.02... | [1568, 1569, 1570, 1571, 1572, 1573, 1574, 157... | maximum latewood density | NaN | year | yr AD | 1983.0 | 1568.0 | Pages2kTemperature | NAm_568 | NaN | NaN |
| 2 | Arc-Arjeplog.Bjorklund.2014 | Wood | 66.30 | 18.20 | 800.0 | density | NaN | NaN | [-0.829089212152348, -0.733882889924006, -0.89... | [1200, 1201, 1202, 1203, 1204, 1205, 1206, 120... | maximum latewood density | NaN | year | yr AD | 2010.0 | 1200.0 | Pages2kTemperature | Arc_060 | NaN | NaN |
| 3 | Asi-CHIN019.Li.2010 | Wood | 29.15 | 99.93 | 2150.0 | trsgi | NaN | NaN | [1.465, 1.327, 1.202, 0.757, 1.094, 1.006, 1.2... | [1509, 1510, 1511, 1512, 1513, 1514, 1515, 151... | ring width | NaN | year | yr AD | 2006.0 | 1509.0 | Pages2kTemperature | Asia_041 | NaN | NaN |
| 4 | NAm-Landslide.Luckman.2006 | Wood | 60.20 | -138.50 | 800.0 | trsgi | NaN | NaN | [1.123, 0.841, 0.863, 1.209, 1.139, 1.056, 0.8... | [913, 914, 915, 916, 917, 918, 919, 920, 921, ... | ring width | NaN | year | yr AD | 2001.0 | 913.0 | Pages2kTemperature | NAm_1876 | NaN | NaN |
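
The request above relies on the endpoint returning CSV and on the call succeeding. A slightly more defensive version of the same step (a sketch; the Accept header is an assumption about the endpoint's content negotiation, not something the original notebook sets):

response = requests.post(url, data={'query': query},
                         headers={'Accept': 'text/csv'})  # ask explicitly for CSV (assumed supported)
response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
df = pd.read_csv(io.StringIO(response.text))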
# the list-valued columns come back as JSON-encoded strings; parse them into Python lists
df['paleoData_values'] = df['paleoData_values'].apply(lambda row: json.loads(row) if isinstance(row, str) else row)
df['time_values'] = df['time_values'].apply(lambda row: json.loads(row) if isinstance(row, str) else row)
import matplotlib.pyplot as plt

def plot_time_coverage(
    df,
    label_col: str = "dataSetName",
    time_col: str = "time_values",
    figsize=(8, None)
):
    """
    Plot a horizontal bar for each row in df showing its time coverage.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Must contain:
          - df[label_col]: categorical labels
          - df[time_col]: iterable of numeric times
    label_col : str
        Name of the column with category labels.
    time_col : str
        Name of the column with sequence of time values.
    figsize : tuple
        Figure size; if height is None it will be computed from number of rows.
    """
    # Compute starts and ends
    starts = df[time_col].apply(min)
    ends   = df[time_col].apply(max)
    durations = ends - starts

    n = len(df)
    height = figsize[1] or max(1, 0.15 * n)
    fig, ax = plt.subplots(figsize=(figsize[0], height))

    y_pos = range(n)
    ax.barh(y=y_pos, width=durations, left=starts)

    ax.set_yticks(y_pos)
    ax.set_yticklabels(df[label_col])
    ax.set_xlabel("Time")
    ax.set_ylabel(label_col)
    ax.set_ylim(min(y_pos) - 0.5, max(y_pos) + 0.5)
    ax.invert_yaxis()  # so the first row is at the top

    plt.tight_layout()
    return fig, ax
fig, ax = plot_time_coverage(df, label_col='dataSetName', time_col='time_values')
[Figure: horizontal bars showing the time coverage of each record]
ts_list = []
for _, row in df.iterrows():
    geo_series = pyleo.GeoSeries(time=row['time_values'],value=row['paleoData_values'],
                            time_name=row['time_variableName'],value_name=row['paleoData_variableName'],
                            time_unit=row['time_units'], value_unit=row['paleoData_units'],
                            lat = row['geo_meanLat'], lon = row['geo_meanLon'], 
                            elevation = row['geo_meanElev'],   
                            archiveType = row['archiveType'],
                            observationType=row['paleoData_proxy'],       
                            label=row['dataSetName']+'_'+row['paleoData_proxy'], verbose = False)
    ts_list.append(geo_series)
fig, ax = pyleo.MultipleGeoSeries(ts_list).time_coverage_plot(label_y_offset=-.08)  # fiddling with label offsets is sometimes necessary for aesthetics

[Figure: time coverage plot of the MultipleGeoSeries]
small_set = [ts_list[0], ts_list[4]]
ms = pyleo.MultipleGeoSeries(small_set)
ms.stackplot();
[Figure: stackplot of the two selected series]
# common interval: the latest start and the earliest end across the two series
bounds = [max([gs.time.min() for gs in small_set]),
          min([gs.time.max() for gs in small_set])]
gs1 = ts_list[0].sel(time=slice(bounds[0], bounds[-1]))
# gs1 = gs1.convert_time_unit('ky BP')
gs1.plot()
(<Figure size 1000x400 with 1 Axes>,
 <Axes: xlabel='Time [yr AD]', ylabel='d18O [permil]'>)
[Figure: d18O series over the common interval]
gs2 = ts_list[4].sel(time=slice(bounds[0], bounds[-1]))
# gs2 = gs2.convert_time_unit('ky BP')
gs2.plot()
(<Figure size 1000x400 with 1 Axes>,
 <Axes: xlabel='Time [yr AD]', ylabel='trsgi [nan]'>)
[Figure: trsgi series over the common interval]
msc = pyleo.MultipleGeoSeries([gs1, gs2]).common_time()
nEns = 1000
n = len(msc.series_list[0].time)
slist = []
for _ in range(nEns):
    # cumulative random perturbation of the time axis: increments drawn from
    # {-0.1, 0, 0.1} yr with probabilities {2%, 96%, 2%}, mimicking age uncertainty
    pert = pyleo.utils.tsmodel.random_time_axis(n,
                                         delta_t_dist='random_choice',
                                         param=[[-0.1, 0, 0.1], [0.02, 0.96, 0.02]])
    ts = msc.series_list[0].copy()
    ts.time = msc.series_list[0].time + pert
    slist.append(ts)
gs1e = pyleo.EnsembleSeries(slist)
gs1e.common_time().plot_envelope(figsize=(16,6))
(<Figure size 1600x600 with 1 Axes>,
 <Axes: xlabel='Time [yr AD]', ylabel='d18O [permil]'>)
[Figure: ensemble envelope of the age-perturbed d18O series]
nEns = 1000
n = len(msc.series_list[1].time)
slist = []
for _ in range(nEns):
    # same perturbation recipe, applied to the second series
    pert = pyleo.utils.tsmodel.random_time_axis(n,
                                         delta_t_dist='random_choice',
                                         param=[[-0.1, 0, 0.1], [0.02, 0.96, 0.02]])
    ts = msc.series_list[1].copy()
    ts.time = msc.series_list[1].time + pert
    slist.append(ts)
gs2e = pyleo.EnsembleSeries(slist)
gs2e.common_time().plot_envelope(figsize=(16,6))
(<Figure size 1600x600 with 1 Axes>,
 <Axes: xlabel='Time [yr AD]', ylabel='trsgi [nan]'>)
[Figure: ensemble envelope of the age-perturbed trsgi series]
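
The two ensemble-generation cells above are identical apart from the series index. A small helper makes the pattern reusable (a sketch, not part of the original notebook; the name perturb_ensemble is invented):

def perturb_ensemble(series, n_ens=1000):
    """Jitter the time axis of `series` n_ens times and bundle the results.

    Each member adds a cumulative random perturbation with increments drawn
    from {-0.1, 0, 0.1} yr with probabilities {2%, 96%, 2%}, matching the
    recipe used in the cells above.
    """
    n = len(series.time)
    slist = []
    for _ in range(n_ens):
        pert = pyleo.utils.tsmodel.random_time_axis(n,
                                             delta_t_dist='random_choice',
                                             param=[[-0.1, 0, 0.1], [0.02, 0.96, 0.02]])
        ts = series.copy()
        ts.time = series.time + pert
        slist.append(ts)
    return pyleo.EnsembleSeries(slist)

# equivalent to the two cells above:
# gs1e = perturb_ensemble(msc.series_list[0])
# gs2e = perturb_ensemble(msc.series_list[1])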
gs2e_m = gs2e.common_time().quantiles(qs=[0.5]).series_list[0]  # the ensemble median, extracted as a plain Series
gs1e_m = gs1e.common_time().quantiles(qs=[0.5]).series_list[0]
for method in ['built-in', 'ar1sim', 'phaseran']:
    corr_m = gs2e_m.correlation(gs1e_m, method=method)
    print(corr_m)
  correlation    p-value  signif. (α: 0.05)
-------------  ---------  -------------------
   -0.0638035       0.04  True
Evaluating association on surrogate pairs: 100%|██████████| 1000/1000 [00:00<00:00, 14077.96it/s]
  correlation    p-value  signif. (α: 0.05)
-------------  ---------  -------------------
   -0.0638035       0.14  False
Evaluating association on surrogate pairs: 100%|██████████| 1000/1000 [00:00<00:00, 14637.66it/s]
  correlation    p-value  signif. (α: 0.05)
-------------  ---------  -------------------
   -0.0638035       0.14  False

corr_e = gs2e.correlation(gs1e, method='phaseran', number=1000)
corr_e.plot()
Looping over 1000 Series in the ensemble
100%|██████████| 1000/1000 [02:36<00:00,  6.40it/s]
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[62], line 2
      1 corr_e = gs2e.correlation(gs1e, method='phaseran',number=1000)
----> 2 corr_e.plot()

File ~/PycharmProjects/Pyleoclim_util/pyleoclim/core/correns.py:205, in CorrEns.plot(self, figsize, title, ax, savefig_settings, hist_kwargs, title_kwargs, xlim, alpha, multiple, clr_insignif, clr_signif, clr_signif_fdr, clr_percentile)
    199 r_signif_fdr = np.array(self.r)[self.signif_fdr]
    200 #r_stack = [r_insignif, r_signif, r_signif_fdr]
    201 #ax.hist(r_stack, stacked=True, **args)
    202 
    203 # put everything into a dataframe to be able to use seaborn
--> 205 data = np.empty((len(self.r),3)); data[:] = np.NaN
    206 col  = [f'p < {self.alpha} (w/ FDR)',f'p < {self.alpha} (w/o FDR)', f'p ≥ {self.alpha}']
    207 data[self.signif_fdr,0] = r_signif_fdr        

File ~/miniconda3/envs/hol_ccm_local_env/lib/python3.11/site-packages/numpy/__init__.py:400, in __getattr__(attr)
    397     raise AttributeError(__former_attrs__[attr], name=None)
    399 if attr in __expired_attributes__:
--> 400     raise AttributeError(
    401         f"`np.{attr}` was removed in the NumPy 2.0 release. "
    402         f"{__expired_attributes__[attr]}",
    403         name=None
    404     )
    406 if attr == "chararray":
    407     warnings.warn(
    408         "`np.chararray` is deprecated and will be removed from "
    409         "the main namespace in the future. Use an array with a string "
    410         "or bytes dtype instead.", DeprecationWarning, stacklevel=2)

AttributeError: `np.NaN` was removed in the NumPy 2.0 release. Use `np.nan` instead.
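
The failure occurs because this pyleoclim build still references the np.NaN alias, which NumPy 2.0 removed. Until pyleoclim is upgraded (or numpy pinned below 2.0), a minimal stopgap is to restore the alias before plotting:

import numpy as np

# NumPy 2.0 removed the np.NaN alias that CorrEns.plot() references here.
# Restoring it is a stopgap; upgrading pyleoclim or pinning numpy<2 is cleaner.
if not hasattr(np, "NaN"):
    np.NaN = np.nan

corr_e.plot()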