Simulation results are stored in NetCDF, individually for each day. Simulations with a particular panel configuration are stored under an individual directory. For example, <DATA_DIR>/module_00/20190619_module_00_analog.nc
is the daily simulation file for 06/19/2019 with the module tag 00
. Detailed information on module tags can be found in modules.csv
.
A summary of the variables included in the NetCDF file is provided below:
%%bash
# Inspect the header (dimensions, variables, global attributes) of one
# daily simulation file without reading the data itself.
export DATA_DIR=~/scratch/data/Predictability/BatchSimulationSlim
ncdump -h $DATA_DIR/module_00/20190619_module_00_analog.nc
netcdf \20190619_module_00_analog { dimensions: member = 21 ; lead_time = 27 ; grid = 56776 ; variables: double power(member, lead_time, grid) ; power:_FillValue = NaN ; power:coordinates = "init_time" ; uint64 lead_time(lead_time) ; uint64 init_time ; init_time:units = "seconds since 1970-01-01" ; init_time:calendar = "proleptic_gregorian" ; // global attributes: :Authors = "Weiming Hu, Guido Cervone" ; :Email = "geolab@psu.edu" ; :Institute = "The Pennsylvania State University" ; :Laboratory = "Geoinformatics and Earth Observation Laboratory (GEOlab)" ; :Institute_Link = "http://geolab.psu.edu" ; :Package = "Parallel Analog Ensemble" ; :Package_Link = "https://weiming-hu.github.io/AnalogsEnsemble" ; }
Please note that coordinates are not saved in simulation files to prevent redundancy. Coordinates can be found in a separate NetCDF file under the data root folder. These coordinates come from the NAM-NMM parent mesh grid with a 12 km horizontal resolution and they are subset to the continental US domain.
%%bash
# Inspect the header of the shared coordinate file (Xs/Ys for every grid point).
export DATA_DIR=~/scratch/data/Predictability/BatchSimulationSlim
ncdump -h $DATA_DIR/coordinates.nc
netcdf coordinates { dimensions: grid = 56776 ; variables: double Xs(grid) ; Xs:_FillValue = NaN ; double Ys(grid) ; Ys:_FillValue = NaN ; // global attributes: :Authors = "Weiming Hu, Guido Cervone" ; :Email = "geolab@psu.edu" ; :Institute = "The Pennsylvania State University" ; :Laboratory = "Geoinformatics and Earth Observation Laboratory (GEOlab)" ; :Institute_Link = "http://geolab.psu.edu" ; :Package = "Parallel Analog Ensemble" ; :Package_Link = "https://weiming-hu.github.io/AnalogsEnsemble" ; }
# For interacting with the operating system
import os
# For extracting string patterns
import glob
# For interacting with tables
import pandas as pd
# For interacting with NetCDF files
import xarray as xr
# For projection
import cartopy
import cartopy.crs as ccrs
# For visualization
import matplotlib.pyplot as plt
# For date and time definition
from datetime import timedelta
# Root directory containing all batch simulation data
data_dir = os.path.expanduser('~/scratch/data/Predictability/BatchSimulationSlim')

# Path to the NetCDF file holding the grid coordinates
# (shared across all modules and days)
file_coordinate = os.path.join(data_dir, 'coordinates.nc')

# Lead times are stored in seconds; this constant lets the rest of the
# notebook index them by hour, which reads more naturally.
sec_in_hour = 3600

# Load the module information table
module_file = os.path.join(data_dir, 'modules.csv')
modules = pd.read_csv(module_file)

# Preview the first three rows of the table
#
# - MP: Maximum Power
# - STC: Power under the standard test condition
#
modules.head(3)
ID | Tag | Code | Area | Material | Cells_in_Series | Year | MP | Label | STC | Efficiency | EffeciencyLink | InfoLink | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | SunPower_128_Cell_Module__2009__E__ | SP128 | 2.144 | c-Si | 128 | 2009 | 400.221000 | 0 | 400.0 | 0.1870 | https://pvfree.herokuapp.com/pvmodules/1491/ | https://us.sunpower.com/sites/default/files/sp... |
1 | 1 | First_Solar_FS_272___2009_ | FS272 | 0.720 | CdTe | 116 | 2009 | 71.265600 | 1 | 72.5 | 0.1007 | https://pvfree.herokuapp.com/pvmodules/1171/ | http://www.solardesigntool.com/components/modu... |
2 | 2 | Solar_Frontier_SF_160S__2013_ | SF160S | 1.220 | CIS | 172 | 2013 | 159.100997 | 2 | 160.0 | 0.1320 | https://pvfree.herokuapp.com/pvmodules/1567/ | http://www.solardesigntool.com/components/modu... |
# Choose the module and the simulation day to inspect
module_id = 0
simulation_date = '20190801'

# Build the path to the daily ensemble file for this module
file_ensemble = os.path.join(
    data_dir,
    f'module_{module_id:02d}/{simulation_date}_module_{module_id:02d}_analog.nc')

# Open the coordinate file and the daily ensemble, then attach the
# coordinates to the ensemble (they are stored separately to avoid redundancy)
coords = xr.open_dataset(file_coordinate)
ensemble = xr.open_dataset(file_ensemble)
ensemble = ensemble.merge(coords)

print(f"The module is {modules['Tag'].iloc[module_id]}\n")
ensemble
The module is SunPower_128_Cell_Module__2009__E__
<xarray.Dataset> Dimensions: (grid: 56776, lead_time: 27, member: 21) Coordinates: * lead_time (lead_time) uint64 21600 25200 28800 ... 108000 111600 115200 init_time datetime64[ns] 2019-08-01 Dimensions without coordinates: grid, member Data variables: power (member, lead_time, grid) float64 ... Xs (grid) float64 262.5 262.6 262.2 262.4 ... 247.1 247.2 247.2 Ys (grid) float64 25.93 25.93 26.04 26.04 ... 45.42 45.64 45.74 Attributes: Authors: Weiming Hu, Guido Cervone Email: geolab@psu.edu Institute: The Pennsylvania State University Laboratory: Geoinformatics and Earth Observation Laboratory (GEOlab) Institute_Link: http://geolab.psu.edu Package: Parallel Analog Ensemble Package_Link: https://weiming-hu.github.io/AnalogsEnsemble
array([ 21600, 25200, 28800, 32400, 36000, 39600, 43200, 46800, 50400, 54000, 57600, 61200, 64800, 68400, 72000, 75600, 79200, 82800, 86400, 90000, 93600, 97200, 100800, 104400, 108000, 111600, 115200], dtype=uint64)
array('2019-08-01T00:00:00.000000000', dtype='datetime64[ns]')
[32191992 values with dtype=float64]
array([262.481842, 262.603714, 262.235626, ..., 247.070504, 247.176784, 247.157614])
array([25.93225 , 25.934237, 26.037575, ..., 45.424149, 45.640242, 45.741457])
# Lead-time hour (UTC) to visualize
visual_hour = 20

# Set up a figure with an orthographic projection centered on CONUS
fig, ax = plt.subplots(
    1, 1, figsize=(15, 7),
    subplot_kw=dict(projection=ccrs.Orthographic(-100, 30)))

# Select the requested lead time, average across ensemble members,
# and render the result as a scatter map
snapshot = ensemble.sel(lead_time=visual_hour * sec_in_hour)
member_mean = snapshot.mean('member', skipna=True)
member_mean.plot.scatter(
    x='Xs', y='Ys', hue='power', s=1, cmap='Spectral_r', ax=ax,
    transform=ccrs.PlateCarree(),
    cbar_kwargs={'label': 'Power Ensemble Mean [W*h]'})

# Overlay grid lines
ax.gridlines()

# Show the figure (this line is sometimes optional)
fig.show()
The previous sections demonstrate how to read and visualize daily results with xarray
. However, it is also desirable to carry out time series analysis across multiple days. The problem is that the year-round simulation result for a module is about 88 GB. It is hard to fit this amount of data into RAM directly. We need help from distributed computing.
In this section, we show how to work directly with 88 GB of data using distributed computing with xarray
and Dask
on a cluster. This solution is platform-independent and can be deployed on major clusters/supercomputers like the NCAR Cheyenne and the Penn State Roar. Dask
provides efficient and convenient parallelization for interacting with multiple NetCDF files.
Important: Here, I assume you are executing this notebook on a cluster with the PBS
scheduler. You should consult with your IT on which scheduler the cluster uses. Currently, the supported schedulers by Dask
can be found here.
# For distributed computing
import distributed
# For managing compute nodes on a cluster
import dask_jobqueue
# Define your project account to charge
project = '******'
# Configure workers: each PBS job provides one single-process worker
# with 40 GB of memory and a one-hour walltime on the 'regular' queue.
# NOTE(review): newer dask_jobqueue versions renamed `project` to
# `account` -- confirm against the installed version.
cluster = dask_jobqueue.PBSCluster(cores=1, processes=1, memory="40GB", walltime='1:00:00', project=project, queue='regular')
# Request workers (submits 10 PBS jobs; they may sit in the queue for a while)
cluster.scale(10)
# Create a backend client for parallelization
client = distributed.Client(cluster)
# Check available client. You might want to wait for a while before jobs start running.
# If everything works, you should see 10 workers available under Cluster --- Workers.
#
client
# If for some reason, the number of workers never reach the demanded number,
# you can manually scale up the number of workers to fix this issue.
#
# But make sure you wait for a while and use `qstat -u user_name` to confirm no jobs
# are pending.
#
# cluster.scale_up(10)
Client
|
Cluster
|
# Collect every daily NetCDF file for the selected module, in chronological
# order (file names start with YYYYMMDD, so a lexical sort is chronological)
pattern = os.path.join(data_dir, f'module_{module_id:02d}/*.nc')
nc_files = sorted(glob.glob(pattern))
print(f'There are {len(nc_files)} daily files for the module {module_id:02d}.')
There are 359 daily files for the module 00.
# Establish a connection to all files in parallel. `ds` stands for 'dataset'.
# Files are opened lazily as Dask arrays (one chunk per daily file) and
# stacked along `init_time`; no power data is read into memory yet.
ds = xr.open_mfdataset(nc_files, decode_cf=True, concat_dim='init_time', combine='nested', parallel=True)
# Add coordinates (stored in a separate file to avoid redundancy)
ds = ds.merge(coords)
At this point, you have access to the year-round ensemble simulation data. Please take a look at the data summary below. Please notice the number of dates in init_time
. You can now easily subset and calculate daily/seasonal/annual statistics.
ds
<xarray.Dataset> Dimensions: (grid: 56776, init_time: 359, lead_time: 27, member: 21) Coordinates: * lead_time (lead_time) uint64 21600 25200 28800 ... 108000 111600 115200 * init_time (init_time) datetime64[ns] 2019-01-01 2019-01-02 ... 2019-12-31 Dimensions without coordinates: grid, member Data variables: power (init_time, member, lead_time, grid) float64 dask.array<chunksize=(1, 21, 27, 56776), meta=np.ndarray> Xs (grid) float64 262.5 262.6 262.2 262.4 ... 247.1 247.2 247.2 Ys (grid) float64 25.93 25.93 26.04 26.04 ... 45.42 45.64 45.74 Attributes: Authors: Weiming Hu, Guido Cervone Email: geolab@psu.edu Institute: The Pennsylvania State University Laboratory: Geoinformatics and Earth Observation Laboratory (GEOlab) Institute_Link: http://geolab.psu.edu Package: Parallel Analog Ensemble Package_Link: https://weiming-hu.github.io/AnalogsEnsemble
array([ 21600, 25200, 28800, 32400, 36000, 39600, 43200, 46800, 50400, 54000, 57600, 61200, 64800, 68400, 72000, 75600, 79200, 82800, 86400, 90000, 93600, 97200, 100800, 104400, 108000, 111600, 115200], dtype=uint64)
array(['2019-01-01T00:00:00.000000000', '2019-01-02T00:00:00.000000000', '2019-01-03T00:00:00.000000000', ..., '2019-12-29T00:00:00.000000000', '2019-12-30T00:00:00.000000000', '2019-12-31T00:00:00.000000000'], dtype='datetime64[ns]')
|
array([262.481842, 262.603714, 262.235626, ..., 247.070504, 247.176784, 247.157614])
array([25.93225 , 25.934237, 26.037575, ..., 45.424149, 45.640242, 45.741457])
Under the hood, data are not actually read into the memory yet. ds
is just a representation of the data structure from the multiple NetCDF files. When you define your calculation and call compute
, that is when the actual computation and file I/O happen.
# Slice data for a particular date and lead-time hour, average the ensemble
# members, and sum over the grid to get total power production over CONUS.
my_date = pd.to_datetime('2019-08-01')
my_lead_time = 16 * sec_in_hour
day_power = ds['power'].sel(init_time=my_date, lead_time=my_lead_time).mean('member').sum()

# Convert time zone for readability
my_date = my_date.tz_localize('UTC').tz_convert('America/New_York')

# Fire the computation. Converting W*h to MW*h requires dividing by 1e6;
# the original divisor 10e6 (= 1e7) understated the total by a factor of 10.
print('The total power production at {} is {:0.2f} MW*h.'.format(
    (my_date + timedelta(seconds=my_lead_time)).strftime(format='%Y/%m/%d %H:%M:%S %Z'),
    day_power.compute().data.item() / 1e6))
The total power production at 2019/08/01 12:00:00 EDT is 2.21 MW*h.
# Annual power production: average over ensemble members, then accumulate
# over every initialization time and lead time (triggers the actual I/O)
plot_data = ds.mean('member').sum(['init_time', 'lead_time']).compute()

# Figure with an orthographic projection centered over CONUS
fig, ax = plt.subplots(
    1, 1, figsize=(7.5, 3.8),
    subplot_kw=dict(projection=ccrs.Orthographic(-100, 30)))

# Coastlines for geographic context
ax.add_feature(cartopy.feature.COASTLINE, edgecolor='grey')

# Scatter layer showing annual production at each grid point
plot_data.plot.scatter(
    x='Xs', y='Ys', hue='power', s=1, cmap='Spectral_r', ax=ax,
    transform=ccrs.PlateCarree(),
    cbar_kwargs={'label': 'Annual Power Production Ensemble Mean [W*h]'})

# Grid lines and margin adjustments
ax.gridlines()
fig.subplots_adjust(left=0.01, right=0.99, bottom=0.01, top=0.95)

# Annotate the map with the module identity
annotation = 'Module {:02d}:\n{}'.format(module_id, modules['Tag'].iloc[module_id])
ax.text(0.006, 0.01, annotation,
        horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes)

fig.show()
# fig.savefig('Figure_AnnualProduction.png', dpi=300)
# plt.close(fig)
# Daily CONUS-wide power production: summarize the ensemble spread with the
# per-member min, median, and max, each summed over lead times and grid points
plot_data_median = ds['power'].median('member').sum(['lead_time', 'grid']).compute()
plot_data_min = ds['power'].min('member').sum(['lead_time', 'grid']).compute()
plot_data_max = ds['power'].max('member').sum(['lead_time', 'grid']).compute()

# Simple time-series figure
fig, ax = plt.subplots(1, 1, figsize=(7, 3.3))

# Shade the min-max envelope, then draw the median on top
ax.fill_between(ds['init_time'], plot_data_min, plot_data_max, facecolor='lightgrey',
                linewidth=1, zorder=7, label='Range', alpha=0.8)
ax.plot(ds['init_time'], plot_data_median, zorder=8, c='red', linewidth=1, label='Median')

# Cosmetics: grid, margins, axis label, and legend
ax.grid()
fig.subplots_adjust(left=0.1, right=0.99, bottom=0.08, top=0.99)
ax.set_ylabel('Daily Power Production over CONUS [W*h]')
ax.legend(loc='upper right')

# Annotate the plot with the module identity
label = 'Module {:02d}: {}'.format(module_id, modules['Tag'].iloc[module_id])
ax.text(0.006, 0.01, label,
        horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes)

fig.show()
# fig.savefig('Figure_DailyProduction.png', dpi=300)
# plt.close(fig)