PPE: Last Millennium Reanalysis

PPE: Last Millennium Reanalysis#

[1]:
%load_ext autoreload
%autoreload 2

import os
os.chdir('/glade/work/fengzhu/GitHub/cfr/docsrc/v2026/notebooks')
import pandas as pd
import xarray as xr
import xesmf as xe
import numpy as np
import cfr
cfr.use('v2026')
print(cfr.__version__)
2026.4.8

Pseudoproxies Setup#

[23]:
df_raw = pd.read_json('https://github.com/fzhu2e/cfr-data/raw/refs/heads/main/pages2kv2.json')
df = df_raw.rename(columns={
    'paleoData_pages2kID': 'pid',
    'geo_meanLat': 'lat',
    'geo_meanLon': 'lon',
    'archiveType': 'archive',
})[['pid', 'lat', 'lon', 'archive']]

# mask = df['archive'].isin(['marine sediment', 'coral', 'sclerosponge', 'bivalve'])
# obs = cfr.Obs(df[mask])
obs = cfr.Obs(df)
obs.df['time'] = object
obs.df['value'] = object
[2]:
dirpath = '/glade/work/fengzhu/Data/GCM_sims'
ds_prior = xr.open_dataset(os.path.join(dirpath, 'CCSM4', 'tas_sfc_Amon_CCSM4_past1000_085001-185012.nc'))
ds_truth = xr.open_dataset(os.path.join(dirpath, 'iCESM', 'LM', 'tas_sfc_Amon_iCESM_past1000historical_085001-200512.nc'))

ds_truth_ann = ds_truth.x.annualize(time2year=True) - 273.15
ds_truth_ann['tas'].attrs['units'] = 'degC'

ds_prior = ds_prior.drop_vars(['lat_bnds', 'lon_bnds', 'height'])
ds_prior_ann = ds_prior.x.annualize(time2year=True) - 273.15
ds_prior_ann['tas'].attrs['units'] = 'degC'

regridder = xe.Regridder(ds_prior_ann, ds_truth_ann, method='bilinear')
ds_prior_ann = regridder(ds_prior_ann)
ds_prior_ann['tas'].attrs = ds_truth_ann['tas'].attrs
[24]:
# a simple white noise model without data attrition
SNR = 10   # var(signal) / var(noise)
for idx, row in obs.df.iterrows():
    np.random.seed(idx)
    obs.df.at[idx, 'time'] = np.arange(850, 2006)
    obs.df.at[idx, 'SNR'] = SNR
    signal = ds_truth_ann.sel(
        lat=row['lat'],
        lon=row['lon'],
        method='nearest',
    ).tas.values
    signal_var = np.var(signal, ddof=1)
    noise_var = signal_var / SNR
    obs.df.at[idx, 'R'] = noise_var
    noise = np.random.normal(loc=0, scale=np.sqrt(noise_var), size=signal.shape)
    obs.df.at[idx, 'value'] = signal + noise

obs.df['type'] = 'T'
obs.df['psm_name'] = 'IdenticalTS'
obs.df
[24]:
pid lat lon archive time value SNR R type psm_name
0 NAm_153 52.70 241.70 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [-0.02292751859725406, 0.3572480420356854, 0.7... 10.0 0.089662 T IdenticalTS
1 Asi_245 23.00 114.00 documents [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [21.125607818090135, 20.116943803724595, 19.96... 10.0 0.021568 T IdenticalTS
2 NAm_165 37.90 252.30 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [6.338173780630901, 7.4904591259960585, 6.8232... 10.0 0.065712 T IdenticalTS
3 Asi_178 28.77 83.73 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [7.1370074286774425, 5.966696792272471, 6.3234... 10.0 0.046389 T IdenticalTS
4 Asi_174 28.18 85.43 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [10.911502987956526, 10.585710012500336, 10.76... 10.0 0.013729 T IdenticalTS
... ... ... ... ... ... ... ... ... ... ...
687 Asi_201 35.88 74.18 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [-4.287775723319058, -4.232198470857572, -3.74... 10.0 0.047043 T IdenticalTS
688 Asi_179 27.50 88.02 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [7.563148014053656, 7.277179376837475, 7.47118... 10.0 0.019030 T IdenticalTS
689 Arc_014 63.62 29.10 lake sediment [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [-1.0157051133439026, -0.25941108796600443, 1.... 10.0 0.212980 T IdenticalTS
690 Ocn_071 16.20 298.51 coral [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [25.283580513311232, 24.776707927084082, 24.91... 10.0 0.012692 T IdenticalTS
691 Ocn_072 16.20 298.51 coral [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [25.144609174862826, 24.651241206523522, 24.64... 10.0 0.012692 T IdenticalTS

692 rows × 10 columns

[25]:
np.unique(obs.df['archive'])
[25]:
array(['bivalve', 'borehole', 'coral', 'documents', 'glacier ice',
       'hybrid', 'lake sediment', 'marine sediment', 'sclerosponge',
       'speleothem', 'tree'], dtype=object)
[26]:
os.makedirs('./data', exist_ok=True)
obs.df.to_json('./data/df_proxy_ocn.json')
df_ocn = pd.read_json('./data/df_proxy_ocn.json')
print(type(df_ocn.loc[0, 'time']))
print(type(df_ocn.loc[0, 'value']))
df_ocn
<class 'list'>
<class 'list'>
[26]:
pid lat lon archive time value SNR R type psm_name
0 NAm_153 52.70 241.70 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [-0.022927518600000002, 0.35724804200000004, 0... 10 0.089662 T IdenticalTS
1 Asi_245 23.00 114.00 documents [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [21.1256078181, 20.1169438037, 19.9622574808, ... 10 0.021568 T IdenticalTS
2 NAm_165 37.90 252.30 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [6.3381737806, 7.490459126, 6.8232782039, 7.80... 10 0.065712 T IdenticalTS
3 Asi_178 28.77 83.73 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [7.1370074287, 5.9666967923000005, 6.323457105... 10 0.046389 T IdenticalTS
4 Asi_174 28.18 85.43 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [10.911502988, 10.5857100125, 10.7688062264, 1... 10 0.013729 T IdenticalTS
... ... ... ... ... ... ... ... ... ... ...
687 Asi_201 35.88 74.18 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [-4.2877757233, -4.2321984709, -3.7413174658, ... 10 0.047043 T IdenticalTS
688 Asi_179 27.50 88.02 tree [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [7.5631480141, 7.2771793768, 7.471187646, 7.01... 10 0.019030 T IdenticalTS
689 Arc_014 63.62 29.10 lake sediment [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [-1.0157051133, -0.259411088, 1.4312935975, -2... 10 0.212980 T IdenticalTS
690 Ocn_071 16.20 298.51 coral [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [25.2835805133, 24.7767079271, 24.9194705363, ... 10 0.012692 T IdenticalTS
691 Ocn_072 16.20 298.51 coral [850, 851, 852, 853, 854, 855, 856, 857, 858, ... [25.1446091749, 24.6512412065, 24.6425996453, ... 10 0.012692 T IdenticalTS

692 rows × 10 columns

[27]:
obs.setup()
obs['Ocn_065'].data
[27]:
pid                                                   Ocn_065
lat                                                     25.84
lon                                                    281.38
archive                                                 coral
time        [850, 851, 852, 853, 854, 855, 856, 857, 858, ...
value       [24.868690830872385, 24.688057548488253, 24.62...
SNR                                                      10.0
R                                                    0.006686
type                                                        T
psm_name                                          IdenticalTS
Name: 12, dtype: object
[28]:
cfr.set_style('journal', font_scale=1.2)
fig, ax = obs.plot(
    levels=np.linspace(-40, 40, 21),
    cbar_kwargs={'ticks': np.linspace(-40, 40, 11)},
    cyclic=True,
    da_target=ds_truth_ann['tas'],
    t_idx=-1,
    title='Truth',
)
../_images/notebooks_da-ppe-transient_9_0.png

Prior Setup#

[29]:
pm = cfr.PriorMember(ds_prior_ann)
pm.gen_samples_bootstrap(nens=30, clim_yrs=1)
prior = cfr.Prior(pm)
prior.ds
[29]:
<xarray.Dataset> Size: 2MB
Dimensions:  (lat: 96, lon: 144, ens: 30)
Coordinates:
  * lat      (lat) float32 384B -90.0 -88.11 -86.21 -84.32 ... 86.21 88.11 90.0
  * lon      (lon) float32 576B 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
Dimensions without coordinates: ens
Data variables:
    tas      (lat, lon, ens) float32 2MB -47.45 -47.06 -48.57 ... -23.29 -20.72
[30]:
fig, ax = prior.ds['tas'].mean('ens').x.plot(
    levels=np.linspace(-40, 40, 21),
    cbar_kwargs={'ticks': np.linspace(-40, 40, 11)},
    cyclic=True,
    title='Prior Ensemble Mean',
)
../_images/notebooks_da-ppe-transient_12_0.png

EnKF Solver Setup#

[31]:
prior.ds_rgd = prior.ds.copy()
prior.ds_ann = prior.ds.copy()

da_solver = cfr.enkf.Solver(prior, obs)
da_solver.prep(
    recon_period=(1801, 1850),
    loc_radius=10000,
)
>>> Proxy System Modeling: Y = H(X)
>>> Computing the localization matrix
Processing variables: 100%|██████████| 1/1 [00:00<00:00,  1.90it/s]
[32]:
# da_solver.run(nproc=1)
[33]:
%%time
da_solver.run(nproc=4, chunksize=10)
>>> DA update
Updating time slices: 100%|██████████| 50/50 [00:12<00:00,  4.06it/s]
CPU times: user 1.33 s, sys: 2.22 s, total: 3.55 s
Wall time: 12.8 s

[34]:
os.makedirs('./recons', exist_ok=True)
da_solver.post.to_netcdf(f'./recons/recon_LMR_SNR10_ocn_loc10000.nc')
da_solver.post
[34]:
<xarray.Dataset> Size: 166MB
Dimensions:  (time: 50, lat: 96, lon: 144, ens: 30)
Coordinates:
  * lat      (lat) float32 384B -90.0 -88.11 -86.21 -84.32 ... 86.21 88.11 90.0
  * lon      (lon) float32 576B 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
  * time     (time) int64 400B 1801 1802 1803 1804 1805 ... 1847 1848 1849 1850
Dimensions without coordinates: ens
Data variables:
    tas      (time, lat, lon, ens) float64 166MB -53.43 -53.49 ... -23.44 -23.22

Validation#

[35]:
fig, ax =  da_solver.plot_valid(
    ds_truth_ann, ds_prior=ds_prior_ann, vn='tas', metric='corr',
    df_sites=obs.df[['lat', 'lon']],
    site_markersizes=10,
    legend=False,
    cyclic=True,
)
../_images/notebooks_da-ppe-transient_19_0.png
[36]:
fig, ax =  da_solver.plot_valid(
    ds_truth_ann, ds_prior=ds_prior_ann, vn='tas', metric='RMSE',
    df_sites=obs.df[['lat', 'lon']],
    site_markersizes=10,
    legend=False,
    cyclic=True,
)
../_images/notebooks_da-ppe-transient_20_0.png
[37]:
fig, ax =  da_solver.plot_valid(ds_truth_ann, ds_prior=ds_prior_ann, metric='nino3.4')
../_images/notebooks_da-ppe-transient_21_0.png
[ ]: