Generate pseudoproxies

Expected time to run through: 10 mins

This tutorial demonstrates how to generate pseudoproxies. We will leverage the spatiotemporal availability of the PAGES2k v2 dataset and the temperature field of iCESM. Pseudoproxies are generated as temperature plus white noise with certain signal-noise-ratio (SNR).

Test data preparation

To go through this tutorial, please prepare test data following the steps: 1. Download the test case named “pseudoPAGES2k_iCESM” with this link. 2. Create a directory named “testcases” in the same directory where this notebook sits. 3. Put the unzipped direcotry “pseudoPAGES2k_iCESM” into “testcases”.

Below, we first load some useful packages, including our LMRt.

[32]:
%load_ext autoreload
%autoreload 2

import LMRt
import GraphEM
import os
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as  plt
from tqdm import tqdm
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[2]:
proxy_dirpath = './testcases/pseudoPAGES2k_iCESM/data/proxy/'
proxy_filename = 'pages2k_dataset.pkl'
model_dirpath = './testcases/pseudoPAGES2k_iCESM/data/model/'
model_filename = 'tas_sfc_Amon_iCESM_past1000historical_085001-200512.nc'
[3]:
# load model simulation
ds = LMRt.Dataset()
ds.load_nc(
    {'tas': os.path.join(model_dirpath, model_filename)},
    varname_dict={'tas': 'tas'},
    inplace=True,
    anom_period=(1951, 1980),
)
ds.seasonalize(list(range(1, 13)), inplace=True)
print(ds)
Dataset Overview
-----------------------

     Name:  tas
   Source:  ./testcases/pseudoPAGES2k_iCESM/data/model/tas_sfc_Amon_iCESM_past1000historical_085001-200512.nc
    Shape:  time:1156, lat:96, lon:144
[24]:
# load proxy database
proxydb = LMRt.ProxyDatabase()

df_pages2k = pd.read_pickle(os.path.join(proxy_dirpath, proxy_filename))
proxydb.load_df(df_pages2k)


# seasonalize
ptype_season = {}
for k in proxydb.type_list:
    ptype_season[k] = list(range(1, 13))

proxydb.seasonalize(ptype_season, inplace=True)
proxydb.refresh()
print(proxydb)
Proxy Database Overview
-----------------------
     Source:        None
       Size:        692
Proxy types:        {'tree.TRW': 354, 'documents': 15, 'tree.MXD': 61, 'lake.midge': 5, 'lake.alkenone': 4, 'coral.calc': 8, 'ice.d18O': 39, 'coral.d18O': 67, 'lake.reflectance': 4, 'marine.alkenone': 22, 'marine.MgCa': 24, 'marine.foram': 4, 'lake.pollen': 11, 'coral.SrCa': 29, 'lake.varve_thickness': 8, 'ice.dD': 8, 'borehole': 3, 'lake.chrysophyte': 1, 'ice.melt': 2, 'lake.varve_property': 1, 'marine.d18O': 1, 'lake.chironomid': 3, 'marine.TEX86': 4, 'lake.BSi': 2, 'speleothem.d18O': 4, 'lake.TEX86': 2, 'marine.diatom': 2, 'hybrid': 1, 'lake.accumulation': 1, 'marine.MAT': 1, 'bivalve.d18O': 1}
[30]:
df_pages2k.columns
[30]:
Index(['paleoData_pages2kID', 'dataSetName', 'archiveType', 'geo_meanElev',
       'geo_meanLat', 'geo_meanLon', 'year', 'yearUnits',
       'paleoData_variableName', 'paleoData_units', 'paleoData_values',
       'paleoData_proxy'],
      dtype='object')
[25]:
# find nearest model grid cell to proxy locales
proxydb.find_nearest_loc(
    ['tas'], ds=ds, ds_type='prior',
    ds_loc_path='./testcases/pseudoPAGES2k_iCESM/loc_idx.pkl',
    save_path='./testcases/pseudoPAGES2k_iCESM/loc_idx.pkl',
)
[26]:
# get model variables for each proxy record
season_tag = '_'.join(str(s) for s in list(range(1, 13)))
proxydb.get_var_from_ds(
    {season_tag: ds},
    ptype_season, ds_type='prior')
[45]:
# gen pseudoproxy

# SNR = 1
SNR = 10

df_pp = df_pages2k.copy()
for idx, row in tqdm(df_pp.iterrows(), total=len(df_pp)):
    # get tas values
    pid = row['paleoData_pages2kID']
    pobj = proxydb.records[pid]
    tas_value = pobj.prior_value['tas'][season_tag]
    tas_time = pobj.prior_time['tas'][season_tag]

    # make pseudoproxy values
    np.random.seed(idx)
    nt = np.size(tas_value)
    tas_std = np.std(tas_value)
    noise_std = tas_std / SNR
    noise_value = np.random.normal(0, noise_std, size=nt)
    pp_value = tas_value + noise_value
    pp_time = np.copy(tas_time)

    # get realistic temporal availability
    pp_value_r = []
    pp_time_r = []
    for i, t in enumerate(pp_time):
        if t in pobj.time:
            pp_value_r.append(pp_value[i])
            pp_time_r.append(pp_time[i])

    row['year'] = np.array(pp_time_r)
    row['paleoData_values'] = np.array(pp_value_r)

# df_pp.to_pickle('./testcases/pseudoPAGES2k_iCESM/pseudoPAGES2k_dataset_tas_wn.pkl')
df_pp.to_pickle('./testcases/pseudoPAGES2k_iCESM/pseudoPAGES2k_dataset_tas_wn_SNR10.pkl')
100%|██████████| 692/692 [00:14<00:00, 47.12it/s]
[47]:
# gen pseudoproxy

# SNR = 1
SNR = 10

df_pp = df_pages2k.copy()
for idx, row in tqdm(df_pp.iterrows(), total=len(df_pp)):
    # get tas values
    pid = row['paleoData_pages2kID']
    pobj = proxydb.records[pid]
    tas_value = pobj.prior_value['tas'][season_tag]
    tas_time = pobj.prior_time['tas'][season_tag]

    # make pseudoproxy values
    np.random.seed(idx)
    nt = np.size(tas_value)
    tas_std = np.std(tas_value)
    noise_std = tas_std / SNR
    noise_value = np.random.normal(0, noise_std, size=nt)
    pp_value = tas_value + noise_value
    pp_time = np.copy(tas_time)

    row['year'] = np.array(pp_time)
    row['paleoData_values'] = np.array(pp_value)

# df_pp.to_pickle('./testcases/pseudoPAGES2k_iCESM/pseudoPAGES2k_dataset_tas_wn_full_temporal_availability.pkl')
df_pp.to_pickle('./testcases/pseudoPAGES2k_iCESM/pseudoPAGES2k_dataset_tas_wn_SNR10_full_temporal_availability.pkl')
100%|██████████| 692/692 [00:00<00:00, 3798.34it/s]
[42]:
print(ds)
Dataset Overview
-----------------------

     Name:  tas
   Source:  ./testcases/pseudoPAGES2k_iCESM/data/model/tas_sfc_Amon_iCESM_past1000historical_085001-200512.nc
    Shape:  time:1156, lat:96, lon:144
[43]:
import cftime

output_dict = {}
output_dict['tas'] = (('time', 'lat', 'lon'), ds.fields['tas'].value)

ds_out = xr.Dataset(
    data_vars=output_dict,
    coords={
        'time': [cftime.DatetimeNoLeap(y, 1, 1) for y in range(850, 2006)],
        'lat': ds.fields['tas'].lat,
        'lon': ds.fields['tas'].lon,
    }
)
ds_out.to_netcdf('./testcases/pseudoPAGES2k_iCESM/iCESM_ann.nc')
[ ]: