Simulation results are stored in NetCDF, individually for each day. Simulations with a particular panel configuration are stored under an individual directory. For example, <DATA_DIR>/module_00/20190619_module_00_analog.nc
is the daily simulation file for 06/19/2019 with the module tag 00
. Detailed information on module tags can be found in modules.csv
.
A summary of the variables included in the NetCDF file is provided below:
%%bash
# Inspect the header (dimensions, variables, global attributes) of one
# daily simulation file without reading the data itself.
export DATA_DIR=~/scratch/data/Predictability/BatchSimulationSlim
ncdump -h $DATA_DIR/module_00/20190619_module_00_analog.nc
netcdf \20190619_module_00_analog { dimensions: member = 21 ; lead_time = 27 ; grid = 56776 ; variables: double power(member, lead_time, grid) ; power:_FillValue = NaN ; power:coordinates = "init_time" ; uint64 lead_time(lead_time) ; uint64 init_time ; init_time:units = "seconds since 1970-01-01" ; init_time:calendar = "proleptic_gregorian" ; // global attributes: :Authors = "Weiming Hu, Guido Cervone" ; :Email = "geolab@psu.edu" ; :Institute = "The Pennsylvania State University" ; :Laboratory = "Geoinformatics and Earth Observation Laboratory (GEOlab)" ; :Institute_Link = "http://geolab.psu.edu" ; :Package = "Parallel Analog Ensemble" ; :Package_Link = "https://weiming-hu.github.io/AnalogsEnsemble" ; }
Please note that coordinates are not saved in simulation files to prevent redundancy. Coordinates can be found in a separate NetCDF file under the data root folder. These coordinates come from the NAM-NMM parent mesh grid with a 12 km horizontal resolution and they are subset to the continental US domain.
%%bash
# Inspect the header of the shared coordinate file (Xs/Ys for every grid point).
export DATA_DIR=~/scratch/data/Predictability/BatchSimulationSlim
ncdump -h $DATA_DIR/coordinates.nc
netcdf coordinates { dimensions: grid = 56776 ; variables: double Xs(grid) ; Xs:_FillValue = NaN ; double Ys(grid) ; Ys:_FillValue = NaN ; // global attributes: :Authors = "Weiming Hu, Guido Cervone" ; :Email = "geolab@psu.edu" ; :Institute = "The Pennsylvania State University" ; :Laboratory = "Geoinformatics and Earth Observation Laboratory (GEOlab)" ; :Institute_Link = "http://geolab.psu.edu" ; :Package = "Parallel Analog Ensemble" ; :Package_Link = "https://weiming-hu.github.io/AnalogsEnsemble" ; }
# For interacting with the operating system
import os
# For extracting string patterns
import glob
# For interacting with tables
import pandas as pd
# For interacting with NetCDF files
import xarray as xr
# For projection
import cartopy
import cartopy.crs as ccrs
# For visualization
import matplotlib.pyplot as plt
# For date and time definition
from datetime import timedelta
# Root directory containing all batch simulation data
data_dir = os.path.expanduser('~/scratch/data/Predictability/BatchSimulationSlim')

# Path to the NetCDF file holding the grid coordinates
# (shared across all modules and days)
file_coordinate = os.path.join(data_dir, 'coordinates.nc')

# Lead times are stored in seconds; this constant lets the rest of the
# notebook index them by hour, which reads more naturally.
sec_in_hour = 3600

# Load the module information table
module_file = os.path.join(data_dir, 'modules.csv')
modules = pd.read_csv(module_file)

# Preview the first three rows of the table
#
# - MP: Maximum Power
# - STC: Power under the standard test condition
#
modules.head(3)
ID | Tag | Code | Area | Material | Cells_in_Series | Year | MP | Label | STC | Efficiency | EffeciencyLink | InfoLink | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | SunPower_128_Cell_Module__2009__E__ | SP128 | 2.144 | c-Si | 128 | 2009 | 400.221000 | 0 | 400.0 | 0.1870 | https://pvfree.herokuapp.com/pvmodules/1491/ | https://us.sunpower.com/sites/default/files/sp... |
1 | 1 | First_Solar_FS_272___2009_ | FS272 | 0.720 | CdTe | 116 | 2009 | 71.265600 | 1 | 72.5 | 0.1007 | https://pvfree.herokuapp.com/pvmodules/1171/ | http://www.solardesigntool.com/components/modu... |
2 | 2 | Solar_Frontier_SF_160S__2013_ | SF160S | 1.220 | CIS | 172 | 2013 | 159.100997 | 2 | 160.0 | 0.1320 | https://pvfree.herokuapp.com/pvmodules/1567/ | http://www.solardesigntool.com/components/modu... |
# Choose the module and the simulation day to inspect
module_id = 0
simulation_date = '20190801'

# Build the path to the daily ensemble file for this module
file_ensemble = os.path.join(
    data_dir,
    f'module_{module_id:02d}/{simulation_date}_module_{module_id:02d}_analog.nc')

# Open the coordinate file and the daily ensemble, then attach the
# coordinates to the ensemble (they are stored separately to avoid redundancy)
coords = xr.open_dataset(file_coordinate)
ensemble = xr.open_dataset(file_ensemble)
ensemble = ensemble.merge(coords)

print(f"The module is {modules['Tag'].iloc[module_id]}\n")
ensemble
The module is SunPower_128_Cell_Module__2009__E__
<xarray.Dataset> Dimensions: (grid: 56776, lead_time: 27, member: 21) Coordinates: * lead_time (lead_time) uint64 21600 25200 28800 ... 108000 111600 115200 init_time datetime64[ns] 2019-08-01 Dimensions without coordinates: grid, member Data variables: power (member, lead_time, grid) float64 ... Xs (grid) float64 262.5 262.6 262.2 262.4 ... 247.1 247.2 247.2 Ys (grid) float64 25.93 25.93 26.04 26.04 ... 45.42 45.64 45.74 Attributes: Authors: Weiming Hu, Guido Cervone Email: geolab@psu.edu Institute: The Pennsylvania State University Laboratory: Geoinformatics and Earth Observation Laboratory (GEOlab) Institute_Link: http://geolab.psu.edu Package: Parallel Analog Ensemble Package_Link: https://weiming-hu.github.io/AnalogsEnsemble
array([ 21600, 25200, 28800, 32400, 36000, 39600, 43200, 46800, 50400, 54000, 57600, 61200, 64800, 68400, 72000, 75600, 79200, 82800, 86400, 90000, 93600, 97200, 100800, 104400, 108000, 111600, 115200], dtype=uint64)
array('2019-08-01T00:00:00.000000000', dtype='datetime64[ns]')
[32191992 values with dtype=float64]
array([262.481842, 262.603714, 262.235626, ..., 247.070504, 247.176784, 247.157614])
array([25.93225 , 25.934237, 26.037575, ..., 45.424149, 45.640242, 45.741457])
# Lead-time hour (UTC) to visualize
visual_hour = 20

# Set up a figure with an orthographic projection centered on CONUS
fig, ax = plt.subplots(
    1, 1, figsize=(15, 7),
    subplot_kw=dict(projection=ccrs.Orthographic(-100, 30)))

# Select the requested lead time, average across ensemble members,
# and render the result as a scatter map
snapshot = ensemble.sel(lead_time=visual_hour * sec_in_hour)
member_mean = snapshot.mean('member', skipna=True)
member_mean.plot.scatter(
    x='Xs', y='Ys', hue='power', s=1, cmap='Spectral_r', ax=ax,
    transform=ccrs.PlateCarree(),
    cbar_kwargs={'label': 'Power Ensemble Mean [W*h]'})

# Overlay grid lines
ax.gridlines()

# Show the figure (this line is sometimes optional)
fig.show()
The previous sections demonstrate how to read and visualize daily results with xarray
. However, it is also desirable to carry out time series analysis across multiple days. The problem is that the year-round simulation result for a module is about 88 GB. It is hard to fit this amount of data into RAM directly. We need help from distributed computing.
In this section, we show how to work directly with 88 GB of data using distributed computing with xarray
and Dask
on a cluster. This solution is platform-independent and can be deployed on major clusters/supercomputers like the NCAR Cheyenne and the Penn State Roar. Dask
provides efficient and convenient parallelization for interacting with multiple NetCDF files.
Important: Here, I assume you are executing this notebook on a cluster with the PBS
scheduler. You should consult with your IT on which scheduler the cluster uses. Currently, the supported schedulers by Dask
can be found here.
# For distributed computing
import distributed
# For managing compute nodes on a cluster
import dask_jobqueue
# Define your project account to charge
project = '******'
# Configure workers: each PBS job provides one single-process worker
# with 40 GB of memory and a one-hour walltime on the 'regular' queue.
# NOTE(review): newer dask_jobqueue versions renamed `project` to
# `account` -- confirm against the installed version.
cluster = dask_jobqueue.PBSCluster(cores=1, processes=1, memory="40GB", walltime='1:00:00', project=project, queue='regular')
# Request workers (submits 10 PBS jobs; they may sit in the queue for a while)
cluster.scale(10)
# Create a backend client for parallelization
client = distributed.Client(cluster)
# Check available client. You might want to wait for a while before jobs start running.
# If everything works, you should see 10 workers available under Cluster --- Workers.
#
client
# If for some reason, the number of workers never reach the demanded number,
# you can manually scale up the number of workers to fix this issue.
#
# But make sure you wait for a while and use `qstat -u user_name` to confirm no jobs
# are pending.
#
# cluster.scale_up(10)
Client
|
Cluster
|
# Collect every daily NetCDF file for the selected module, in chronological
# order (file names start with YYYYMMDD, so a lexical sort is chronological)
pattern = os.path.join(data_dir, f'module_{module_id:02d}/*.nc')
nc_files = sorted(glob.glob(pattern))
print(f'There are {len(nc_files)} daily files for the module {module_id:02d}.')
There are 359 daily files for the module 00.
# Establish a connection to all files in parallel. `ds` stands for 'dataset'.
# Files are opened lazily as Dask arrays (one chunk per daily file) and
# stacked along `init_time`; no power data is read into memory yet.
ds = xr.open_mfdataset(nc_files, decode_cf=True, concat_dim='init_time', combine='nested', parallel=True)
# Add coordinates (stored in a separate file to avoid redundancy)
ds = ds.merge(coords)
At this point, you have access to the year-round ensemble simulation data. Please take a look at the data summary below. Please notice the number of dates in init_time
. You can now easily subset and calculate daily/seasonal/annual statistics.
ds
<xarray.Dataset> Dimensions: (grid: 56776, init_time: 359, lead_time: 27, member: 21) Coordinates: * lead_time (lead_time) uint64 21600 25200 28800 ... 108000 111600 115200 * init_time (init_time) datetime64[ns] 2019-01-01 2019-01-02 ... 2019-12-31 Dimensions without coordinates: grid, member Data variables: power (init_time, member, lead_time, grid) float64 dask.array<chunksize=(1, 21, 27, 56776), meta=np.ndarray> Xs (grid) float64 262.5 262.6 262.2 262.4 ... 247.1 247.2 247.2 Ys (grid) float64 25.93 25.93 26.04 26.04 ... 45.42 45.64 45.74 Attributes: Authors: Weiming Hu, Guido Cervone Email: geolab@psu.edu Institute: The Pennsylvania State University Laboratory: Geoinformatics and Earth Observation Laboratory (GEOlab) Institute_Link: http://geolab.psu.edu Package: Parallel Analog Ensemble Package_Link: https://weiming-hu.github.io/AnalogsEnsemble
array([ 21600, 25200, 28800, 32400, 36000, 39600, 43200, 46800, 50400, 54000, 57600, 61200, 64800, 68400, 72000, 75600, 79200, 82800, 86400, 90000, 93600, 97200, 100800, 104400, 108000, 111600, 115200], dtype=uint64)
array(['2019-01-01T00:00:00.000000000', '2019-01-02T00:00:00.000000000', '2019-01-03T00:00:00.000000000', ..., '2019-12-29T00:00:00.000000000', '2019-12-30T00:00:00.000000000', '2019-12-31T00:00:00.000000000'], dtype='datetime64[ns]')
|
array([262.481842, 262.603714, 262.235626, ..., 247.070504, 247.176784, 247.157614])
array([25.93225 , 25.934237, 26.037575, ..., 45.424149, 45.640242, 45.741457])
Under the hood, data are not actually read into the memory yet. ds
is just a representation of the data structure from the multiple NetCDF files. When you define your calculation and call compute
, that is when the actual computation and file I/O happen.
# Slice data for a particular date and lead-time hour, average the ensemble
# members, and sum over the grid to get total power production over CONUS.
my_date = pd.to_datetime('2019-08-01')
my_lead_time = 16 * sec_in_hour
day_power = ds['power'].sel(init_time=my_date, lead_time=my_lead_time).mean('member').sum()

# Convert time zone for readability
my_date = my_date.tz_localize('UTC').tz_convert('America/New_York')

# Fire the computation. Converting W*h to MW*h requires dividing by 1e6;
# the original divisor 10e6 (= 1e7) understated the total by a factor of 10.
print('The total power production at {} is {:0.2f} MW*h.'.format(
    (my_date + timedelta(seconds=my_lead_time)).strftime(format='%Y/%m/%d %H:%M:%S %Z'),
    day_power.compute().data.item() / 1e6))
The total power production at 2019/08/01 12:00:00 EDT is 2.21 MW*h.
# Annual power production: average over ensemble members, then accumulate
# over every initialization time and lead time (triggers the actual I/O)
plot_data = ds.mean('member').sum(['init_time', 'lead_time']).compute()

# Figure with an orthographic projection centered over CONUS
fig, ax = plt.subplots(
    1, 1, figsize=(7.5, 3.8),
    subplot_kw=dict(projection=ccrs.Orthographic(-100, 30)))

# Coastlines for geographic context
ax.add_feature(cartopy.feature.COASTLINE, edgecolor='grey')

# Scatter layer showing annual production at each grid point
plot_data.plot.scatter(
    x='Xs', y='Ys', hue='power', s=1, cmap='Spectral_r', ax=ax,
    transform=ccrs.PlateCarree(),
    cbar_kwargs={'label': 'Annual Power Production Ensemble Mean [W*h]'})

# Grid lines and margin adjustments
ax.gridlines()
fig.subplots_adjust(left=0.01, right=0.99, bottom=0.01, top=0.95)

# Annotate the map with the module identity
annotation = 'Module {:02d}:\n{}'.format(module_id, modules['Tag'].iloc[module_id])
ax.text(0.006, 0.01, annotation,
        horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes)

fig.show()
# fig.savefig('Figure_AnnualProduction.png', dpi=300)
# plt.close(fig)
# Daily CONUS-wide power production: summarize the ensemble spread with the
# per-member min, median, and max, each summed over lead times and grid points
plot_data_median = ds['power'].median('member').sum(['lead_time', 'grid']).compute()
plot_data_min = ds['power'].min('member').sum(['lead_time', 'grid']).compute()
plot_data_max = ds['power'].max('member').sum(['lead_time', 'grid']).compute()

# Simple time-series figure
fig, ax = plt.subplots(1, 1, figsize=(7, 3.3))

# Shade the min-max envelope, then draw the median on top
ax.fill_between(ds['init_time'], plot_data_min, plot_data_max, facecolor='lightgrey',
                linewidth=1, zorder=7, label='Range', alpha=0.8)
ax.plot(ds['init_time'], plot_data_median, zorder=8, c='red', linewidth=1, label='Median')

# Cosmetics: grid, margins, axis label, and legend
ax.grid()
fig.subplots_adjust(left=0.1, right=0.99, bottom=0.08, top=0.99)
ax.set_ylabel('Daily Power Production over CONUS [W*h]')
ax.legend(loc='upper right')

# Annotate the plot with the module identity
label = 'Module {:02d}: {}'.format(module_id, modules['Tag'].iloc[module_id])
ax.text(0.006, 0.01, label,
        horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes)

fig.show()
# fig.savefig('Figure_DailyProduction.png', dpi=300)
# plt.close(fig)