Correlating two GeoSeries

This notebook performs a small analysis: computing the correlation between two GeoSeries. It queries the LiPDverse SPARQL endpoint for Pages2kTemperature records, builds pyleoclim GeoSeries from the results, aligns two of them on a common time interval, and tests the significance of their correlation, both on ensemble medians and across age-perturbed ensembles.

import json
import io
import requests
import pandas as pd
import pyleoclim as pyleo
url = 'https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse-dynamic'  # consider pinning a version (5?) to avoid breaking changes
url_d = {
    '{le_var}': 'http://linked.earth/ontology/paleo_variables#',
    '{wgs84}': 'http://www.w3.org/2003/01/geo/wgs84_pos#',
    '{le}': 'http://linked.earth/ontology#',
    '{rdfs}': 'http://www.w3.org/2000/01/rdf-schema#'
}
query = """
PREFIX le: <{le}>
PREFIX le_var: <{le_var}>
PREFIX wgs84: <{wgs84}>
PREFIX rdfs: <{rdfs}>

SELECT DISTINCT 
    ?dataSetName ?archiveType ?geo_meanLat ?geo_meanLon ?geo_meanElev 
    ?paleoData_variableName  ?resolution ?paleoData_units ?paleoData_values ?time_values 
    ?paleoData_proxy ?paleoData_proxyGeneral 
    ?time_variableName  ?time_units ?maxTime ?minTime
    ?compilationName ?TSID 
    ?startDate ?endDate 
WHERE {

    ?ds a le:Dataset .
    ?ds le:hasName ?dataSetName .

    OPTIONAL {
        ?ds le:hasArchiveType ?archiveTypeObj .
        ?archiveTypeObj rdfs:label ?archiveType .
    }

    ?ds le:hasLocation ?loc .
    OPTIONAL {?loc wgs84:lat ?geo_meanLat .}
    OPTIONAL {?loc wgs84:long ?geo_meanLon .}      
    OPTIONAL {?loc wgs84:alt ?geo_meanElev .}

    ?ds le:hasPaleoData ?data .
    ?data le:hasMeasurementTable ?table .
    ?table le:hasVariable ?var .

    ?var le:hasName ?paleoData_variableName .
    ?var le:hasValues ?paleoData_values .
    
    OPTIONAL {
        ?var le:hasUnits ?paleoData_unitsObj .
        ?paleoData_unitsObj rdfs:label ?paleoData_units .
    }
    OPTIONAL {
        ?var le:hasProxy ?paleoData_proxyObj .
        ?paleoData_proxyObj rdfs:label ?paleoData_proxy .
    }
    OPTIONAL {
        ?var le:hasProxyGeneral ?paleoData_proxyGeneralObj .
        ?paleoData_proxyGeneralObj rdfs:label ?paleoData_proxyGeneral .
    }

    ?var le:partOfCompilation ?compilation .
    ?compilation le:hasName ?compilationName .
    VALUES ?compilationName {"Pages2kTemperature"} .

    ?var le:useInGlobalTemperatureAnalysis True .

    OPTIONAL { ?var le:hasVariableId ?TSID }

    ?table le:hasVariable ?timevar .
    ?timevar le:hasName ?time_variableName .
    ?timevar le:hasStandardVariable le_var:year .
    ?timevar le:hasValues ?time_values .
    ?timevar le:hasMinValue ?minTime .
    ?timevar le:hasMaxValue ?maxTime .
    OPTIONAL{
        ?timevar le:hasUnits ?time_unitsObj .
        ?time_unitsObj rdfs:label ?time_units .
    }


}
"""
def update_urls(query, url_d):
    """Substitute the {prefix} placeholders in the query with their full URIs."""
    for key, value in url_d.items():
        query = query.replace(key, value)
    return query
query = update_urls(query, url_d)
response = requests.post(url, data={'query': query})

data = io.StringIO(response.text)
df = pd.read_csv(data, sep=",")
df.head()
| | dataSetName | archiveType | geo_meanLat | geo_meanLon | geo_meanElev | paleoData_variableName | resolution | paleoData_units | paleoData_values | time_values | paleoData_proxy | paleoData_proxyGeneral | time_variableName | time_units | maxTime | minTime | compilationName | TSID | startDate | endDate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Ant-WDC05A.Steig.2013 | Glacier ice | -79.46 | -112.09 | 1806.0 | d18O | NaN | permil | [-33.32873325, -35.6732, -33.1574, -34.2854, -... | [2005, 2004, 2003, 2002, 2001, 2000, 1999, 199... | d18O | NaN | year | yr AD | 2005.0 | 786.0 | Pages2kTemperature | Ant_030 | NaN | NaN |
| 1 | NAm-MtLemon.Briffa.2002 | Wood | 32.50 | -110.80 | 2700.0 | MXD | NaN | NaN | [0.968, 0.962, 1.013, 0.95, 1.008, 0.952, 1.02... | [1568, 1569, 1570, 1571, 1572, 1573, 1574, 157... | maximum latewood density | NaN | year | yr AD | 1983.0 | 1568.0 | Pages2kTemperature | NAm_568 | NaN | NaN |
| 2 | Arc-Arjeplog.Bjorklund.2014 | Wood | 66.30 | 18.20 | 800.0 | density | NaN | NaN | [-0.829089212152348, -0.733882889924006, -0.89... | [1200, 1201, 1202, 1203, 1204, 1205, 1206, 120... | maximum latewood density | NaN | year | yr AD | 2010.0 | 1200.0 | Pages2kTemperature | Arc_060 | NaN | NaN |
| 3 | Asi-CHIN019.Li.2010 | Wood | 29.15 | 99.93 | 2150.0 | trsgi | NaN | NaN | [1.465, 1.327, 1.202, 0.757, 1.094, 1.006, 1.2... | [1509, 1510, 1511, 1512, 1513, 1514, 1515, 151... | ring width | NaN | year | yr AD | 2006.0 | 1509.0 | Pages2kTemperature | Asia_041 | NaN | NaN |
| 4 | NAm-Landslide.Luckman.2006 | Wood | 60.20 | -138.50 | 800.0 | trsgi | NaN | NaN | [1.123, 0.841, 0.863, 1.209, 1.139, 1.056, 0.8... | [913, 914, 915, 916, 917, 918, 919, 920, 921, ... | ring width | NaN | year | yr AD | 2001.0 | 913.0 | Pages2kTemperature | NAm_1876 | NaN | NaN |
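
The request above relies on the endpoint returning CSV and on the call succeeding. A slightly more defensive version of the same step (a sketch; the Accept header is an assumption about the endpoint's content negotiation, not something the original notebook sets):

response = requests.post(url, data={'query': query},
                         headers={'Accept': 'text/csv'})  # ask explicitly for CSV (assumed supported)
response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
df = pd.read_csv(io.StringIO(response.text))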
# the list-valued columns come back as JSON-encoded strings; parse them into Python lists
df['paleoData_values'] = df['paleoData_values'].apply(lambda row: json.loads(row) if isinstance(row, str) else row)
df['time_values'] = df['time_values'].apply(lambda row: json.loads(row) if isinstance(row, str) else row)
import matplotlib.pyplot as plt

def plot_time_coverage(
    df,
    label_col: str = "dataSetName",
    time_col: str = "time_values",
    figsize=(8, None)
):
    """
    Plot a horizontal bar for each row in df showing its time coverage.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Must contain:
          - df[label_col]: categorical labels
          - df[time_col]: iterable of numeric times
    label_col : str
        Name of the column with category labels.
    time_col : str
        Name of the column with sequence of time values.
    figsize : tuple
        Figure size; if height is None it will be computed from number of rows.
    """
    # Compute starts and ends
    starts = df[time_col].apply(min)
    ends   = df[time_col].apply(max)
    durations = ends - starts

    n = len(df)
    height = figsize[1] or max(1, 0.15 * n)
    fig, ax = plt.subplots(figsize=(figsize[0], height))

    y_pos = range(n)
    ax.barh(y=y_pos, width=durations, left=starts)

    ax.set_yticks(y_pos)
    ax.set_yticklabels(df[label_col])
    ax.set_xlabel("Time")
    ax.set_ylabel(label_col)
    ax.set_ylim(min(y_pos) - 0.5, max(y_pos) + 0.5)
    ax.invert_yaxis()  # so the first row is at the top

    plt.tight_layout()
    return fig, ax
fig, ax = plot_time_coverage(df, label_col='dataSetName', time_col='time_values')
[Figure: horizontal bars showing the time coverage of each record]
ts_list = []
for _, row in df.iterrows():
    geo_series = pyleo.GeoSeries(time=row['time_values'],value=row['paleoData_values'],
                            time_name=row['time_variableName'],value_name=row['paleoData_variableName'],
                            time_unit=row['time_units'], value_unit=row['paleoData_units'],
                            lat = row['geo_meanLat'], lon = row['geo_meanLon'], 
                            elevation = row['geo_meanElev'],   
                            archiveType = row['archiveType'],
                            observationType=row['paleoData_proxy'],       
                            label=row['dataSetName']+'_'+row['paleoData_proxy'], verbose = False)
    ts_list.append(geo_series)
fig, ax = pyleo.MultipleGeoSeries(ts_list).time_coverage_plot(label_y_offset=-.08)  # fiddling with label offsets is sometimes necessary for aesthetics

[Figure: time coverage plot of the MultipleGeoSeries]
small_set = [ts_list[0], ts_list[4]]
ms = pyleo.MultipleGeoSeries(small_set)
ms.stackplot();
[Figure: stackplot of the two selected series]
# common interval: the latest start and the earliest end across the two series
bounds = [max([gs.time.min() for gs in small_set]),
          min([gs.time.max() for gs in small_set])]
gs1 = ts_list[0].sel(time=slice(bounds[0], bounds[-1]))
# gs1 = gs1.convert_time_unit('ky BP')
gs1.plot()
(<Figure size 1000x400 with 1 Axes>,
 <Axes: xlabel='Time [yr AD]', ylabel='d18O [permil]'>)
[Figure: d18O series over the common interval]
gs2 = ts_list[4].sel(time=slice(bounds[0], bounds[-1]))
# gs2 = gs2.convert_time_unit('ky BP')
gs2.plot()
(<Figure size 1000x400 with 1 Axes>,
 <Axes: xlabel='Time [yr AD]', ylabel='trsgi [nan]'>)
[Figure: trsgi series over the common interval]
msc = pyleo.MultipleGeoSeries([gs1, gs2]).common_time()
nEns = 1000
n = len(msc.series_list[0].time)
slist = []
for _ in range(nEns):
    # cumulative random perturbation of the time axis: increments drawn from
    # {-0.1, 0, 0.1} yr with probabilities {2%, 96%, 2%}, mimicking age uncertainty
    pert = pyleo.utils.tsmodel.random_time_axis(n,
                                         delta_t_dist='random_choice',
                                         param=[[-0.1, 0, 0.1], [0.02, 0.96, 0.02]])
    ts = msc.series_list[0].copy()
    ts.time = msc.series_list[0].time + pert
    slist.append(ts)
gs1e = pyleo.EnsembleSeries(slist)
gs1e.common_time().plot_envelope(figsize=(16,6))
(<Figure size 1600x600 with 1 Axes>,
 <Axes: xlabel='Time [yr AD]', ylabel='d18O [permil]'>)
[Figure: ensemble envelope of the age-perturbed d18O series]
nEns = 1000
n = len(msc.series_list[1].time)
slist = []
for _ in range(nEns):
    # same perturbation recipe, applied to the second series
    pert = pyleo.utils.tsmodel.random_time_axis(n,
                                         delta_t_dist='random_choice',
                                         param=[[-0.1, 0, 0.1], [0.02, 0.96, 0.02]])
    ts = msc.series_list[1].copy()
    ts.time = msc.series_list[1].time + pert
    slist.append(ts)
gs2e = pyleo.EnsembleSeries(slist)
gs2e.common_time().plot_envelope(figsize=(16,6))
(<Figure size 1600x600 with 1 Axes>,
 <Axes: xlabel='Time [yr AD]', ylabel='trsgi [nan]'>)
[Figure: ensemble envelope of the age-perturbed trsgi series]
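
The two ensemble-generation cells above are identical apart from the series index. A small helper makes the pattern reusable (a sketch, not part of the original notebook; the name perturb_ensemble is invented):

def perturb_ensemble(series, n_ens=1000):
    """Jitter the time axis of `series` n_ens times and bundle the results.

    Each member adds a cumulative random perturbation with increments drawn
    from {-0.1, 0, 0.1} yr with probabilities {2%, 96%, 2%}, matching the
    recipe used in the cells above.
    """
    n = len(series.time)
    slist = []
    for _ in range(n_ens):
        pert = pyleo.utils.tsmodel.random_time_axis(n,
                                             delta_t_dist='random_choice',
                                             param=[[-0.1, 0, 0.1], [0.02, 0.96, 0.02]])
        ts = series.copy()
        ts.time = series.time + pert
        slist.append(ts)
    return pyleo.EnsembleSeries(slist)

# equivalent to the two cells above:
# gs1e = perturb_ensemble(msc.series_list[0])
# gs2e = perturb_ensemble(msc.series_list[1])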
gs2e_m = gs2e.common_time().quantiles(qs=[0.5]).series_list[0]  # the ensemble median, extracted as a plain Series
gs1e_m = gs1e.common_time().quantiles(qs=[0.5]).series_list[0]
for method in ['built-in', 'ar1sim', 'phaseran']:
    corr_m = gs2e_m.correlation(gs1e_m, method=method)
    print(corr_m)
  correlation    p-value  signif. (α: 0.05)
-------------  ---------  -------------------
   -0.0638035       0.04  True
Evaluating association on surrogate pairs: 100%|██████████| 1000/1000 [00:00<00:00, 14077.96it/s]
  correlation    p-value  signif. (α: 0.05)
-------------  ---------  -------------------
   -0.0638035       0.14  False
Evaluating association on surrogate pairs: 100%|██████████| 1000/1000 [00:00<00:00, 14637.66it/s]
  correlation    p-value  signif. (α: 0.05)
-------------  ---------  -------------------
   -0.0638035       0.14  False

corr_e = gs2e.correlation(gs1e, method='phaseran', number=1000)
corr_e.plot()
Looping over 1000 Series in the ensemble
100%|██████████| 1000/1000 [02:36<00:00,  6.40it/s]
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[62], line 2
      1 corr_e = gs2e.correlation(gs1e, method='phaseran',number=1000)
----> 2 corr_e.plot()

File ~/PycharmProjects/Pyleoclim_util/pyleoclim/core/correns.py:205, in CorrEns.plot(self, figsize, title, ax, savefig_settings, hist_kwargs, title_kwargs, xlim, alpha, multiple, clr_insignif, clr_signif, clr_signif_fdr, clr_percentile)
    199 r_signif_fdr = np.array(self.r)[self.signif_fdr]
    200 #r_stack = [r_insignif, r_signif, r_signif_fdr]
    201 #ax.hist(r_stack, stacked=True, **args)
    202 
    203 # put everything into a dataframe to be able to use seaborn
--> 205 data = np.empty((len(self.r),3)); data[:] = np.NaN
    206 col  = [f'p < {self.alpha} (w/ FDR)',f'p < {self.alpha} (w/o FDR)', f'p ≥ {self.alpha}']
    207 data[self.signif_fdr,0] = r_signif_fdr        

File ~/miniconda3/envs/hol_ccm_local_env/lib/python3.11/site-packages/numpy/__init__.py:400, in __getattr__(attr)
    397     raise AttributeError(__former_attrs__[attr], name=None)
    399 if attr in __expired_attributes__:
--> 400     raise AttributeError(
    401         f"`np.{attr}` was removed in the NumPy 2.0 release. "
    402         f"{__expired_attributes__[attr]}",
    403         name=None
    404     )
    406 if attr == "chararray":
    407     warnings.warn(
    408         "`np.chararray` is deprecated and will be removed from "
    409         "the main namespace in the future. Use an array with a string "
    410         "or bytes dtype instead.", DeprecationWarning, stacklevel=2)

AttributeError: `np.NaN` was removed in the NumPy 2.0 release. Use `np.nan` instead.
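
The failure occurs because this pyleoclim build still references the np.NaN alias, which NumPy 2.0 removed. Until pyleoclim is upgraded (or numpy pinned below 2.0), a minimal stopgap is to restore the alias before plotting:

import numpy as np

# NumPy 2.0 removed the np.NaN alias that CorrEns.plot() references here.
# Restoring it is a stopgap; upgrading pyleoclim or pinning numpy<2 is cleaner.
if not hasattr(np, "NaN"):
    np.NaN = np.nan

corr_e.plot()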