icenet.data package#

Subpackages#

Submodules#

icenet.data.cli module#

icenet.data.cli.add_date_args(arg_parser: object)[source]#
Parameters:

arg_parser

icenet.data.cli.csv_arg(string: str) list[source]#
Parameters:

string

Returns:

icenet.data.cli.csv_of_csv_arg(string: str) list[source]#
Parameters:

string

Returns:

icenet.data.cli.date_arg(string: str) object[source]#
Parameters:

string

Returns:

icenet.data.cli.dates_arg(string: str) object[source]#
Parameters:

string

Returns:

icenet.data.cli.download_args(choices: object = None, dates: bool = True, dates_optional: bool = False, var_specs: bool = True, workers: bool = False, extra_args: object = ()) object[source]#
Parameters:
  • choices

  • dates

  • dates_optional

  • var_specs

  • workers

  • extra_args

Returns:

icenet.data.cli.int_or_list_arg(string: str) object[source]#
Parameters:

string

Returns:

icenet.data.cli.process_args(dates: bool = True, ref_option: bool = True, extra_args: object = ()) object[source]#
Parameters:
  • dates

  • ref_option

  • extra_args

Returns:

icenet.data.cli.process_date_args(args: object) dict[source]#
Parameters:

args

Returns:

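As a usage sketch (not documented behaviour), the snippet below chains a few of these helpers; the string formats accepted by csv_arg and date_arg and the attributes on the parsed arguments are assumptions for illustration.

from icenet.data.cli import csv_arg, date_arg, download_args, process_date_args

# Argument converters, typically passed as argparse type= callables.
variables = csv_arg("uas,vas,tas")   # assumed to yield ["uas", "vas", "tas"]
start = date_arg("2020-01-01")       # assumed to parse an ISO-style date

# download_args builds and parses a downloader-style CLI (reading sys.argv);
# process_date_args then collects the requested dates into a dict.
args = download_args(workers=True)
dates = process_date_args(args)
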
icenet.data.dataset module#

class icenet.data.dataset.IceNetDataSet(configuration_path: str, *args, batch_size: int = 4, path: str = './network_datasets', shuffling: bool = False, **kwargs)[source]#

Bases: SplittingMixin, DataCollection

Initialises and configures a dataset.

It loads a JSON configuration file, updates the _config attribute with the result, creates a data loader, and provides methods to access the dataset.

_config#

A dict used to store the configuration loaded from the JSON file.

_configuration_path#

The path to the JSON configuration file.

_batch_size#

The batch size for the data loader.

Type:

int

_counts#

A dict with the number of elements in the train, val, and test sets.

_dtype#

The type of the dataset.

Type:

object

_loader_config#

The path to the data loader configuration file.

_generate_workers#

An integer representing the number of workers for parallel processing with Dask.

_n_forecast_days#

An integer representing the number of days to predict for.

Type:

int

_num_channels#

An integer representing the number of channels (input variables) in the dataset.

Type:

int

_shape#

The shape of the dataset.

Type:

int

_shuffling#

A flag indicating whether to shuffle the data or not.

Type:

bool

property channels: list#

The list of channels (variable names) specified in the dataset config file.

property counts: dict#

A dict with the number of elements in the train, val, and test sets, as recorded in the config file.

get_data_loader(n_forecast_days: object = None, generate_workers: object = None) object[source]#

Create an instance of the IceNetDataLoader class.

Parameters:
  • n_forecast_days (optional) – The number of forecast days to be used by the data loader. If not provided, defaults to the value specified in the configuration file.

  • generate_workers (optional) – An integer representing number of workers to use for parallel processing with Dask. If not provided, defaults to the value specified in the configuration file.

Returns:

An instance of the DaskMultiWorkerLoader class configured with the specified parameters.

property loader_config: str#

The path to the JSON loader configuration file stored in the dataset config file.

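A minimal usage sketch follows; the configuration filename is a placeholder for a dataset configuration generated elsewhere in the pipeline.

from icenet.data.dataset import IceNetDataSet

# Placeholder path to a generated dataset configuration.
dataset = IceNetDataSet("dataset_config.demo.json", batch_size=4, shuffling=True)

print(dataset.channels)       # variable names from the config
print(dataset.counts)         # train/val/test element counts
print(dataset.loader_config)  # path to the loader configuration

# n_forecast_days and generate_workers default to the configured values.
loader = dataset.get_data_loader()
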
class icenet.data.dataset.MergedIceNetDataSet(configuration_paths: object, *args, batch_size: int = 4, path: str = './network_datasets', shuffling: bool = False, **kwargs)[source]#

Bases: SplittingMixin, DataCollection

Parameters:
  • identifier

  • configuration_paths – List of configurations to load

  • batch_size

  • path

property channels#
check_dataset(split: str = 'train')[source]#
Parameters:

split

property counts#
get_data_loader()[source]#
Returns:

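A brief sketch, assuming two compatible dataset configurations already exist; both paths are placeholders.

from icenet.data.dataset import MergedIceNetDataSet

merged = MergedIceNetDataSet(["dataset_config.a.json", "dataset_config.b.json"],
                             batch_size=4)
merged.check_dataset("train")      # sanity-check the training split
loader = merged.get_data_loader()
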
icenet.data.dataset.check_dataset() None[source]#

Check the dataset for a specific split.

icenet.data.dataset.get_args() object[source]#

Parse command line arguments using the argparse module.

Returns:

An object containing the parsed command line arguments.

Example

Assuming CLI arguments are provided:

args = get_args()
print(args.dataset)
print(args.split)
print(args.verbose)

icenet.data.loader module#

icenet.data.loader.create()[source]#
icenet.data.loader.create_get_args() object[source]#

Converts input data creation argument strings to objects and assigns them as attributes of the namespace.

The arguments added in this function relate to the dataloader creation process.

Returns:

An argparse.Namespace object with all arguments added via add_argument accessible as object attributes.

icenet.data.loader.save_sample(output_folder: str, date: object, sample: tuple)[source]#
Parameters:
  • output_folder

  • date

  • sample

icenet.data.process module#

class icenet.data.process.IceNetPreProcessor(abs_vars, anom_vars, name, train_dates, val_dates, test_dates, *args, data_shape=(432, 432), dtype=<class 'numpy.float32'>, exclude_vars=(), file_filters=('latlon_', ), identifier=None, linear_trends=('siconca', ), linear_trend_steps=7, meta_vars=(), missing_dates=(), minmax=True, no_normalise=('siconca', ), path='./processed', parallel_opens=False, ref_procdir=None, source_data='./data', update_key=None, update_loader=True, **kwargs)[source]#

Bases: Processor

Parameters:
  • abs_vars

  • anom_vars

  • name

  • train_dates

  • val_dates

  • test_dates

  • *args

  • data_shape

  • dtype

  • exclude_vars

  • file_filters

  • identifier

  • linear_trends

  • linear_trend_steps

  • meta_vars

  • missing_dates

  • minmax

  • no_normalise

  • path

  • parallel_opens

  • ref_procdir

  • source_data

  • update_key

  • update_loader

DATE_FORMAT = '%Y_%m_%d'#
static mean_and_std(array: object)[source]#

Return the mean and standard deviation of an array-like object (intended use case is normalising a raw satellite data array based on a list of samples used for training).

Parameters:

array

Returns:

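For illustration, a small sketch that assumes the method returns a (mean, std) pair:

import numpy as np

from icenet.data.process import IceNetPreProcessor

# Toy stand-in for a raw satellite data array.
array = np.array([0.0, 0.25, 0.5, 0.75, 1.0], dtype=np.float32)
mean, std = IceNetPreProcessor.mean_and_std(array)
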
property missing_dates#
post_normalisation(var_name: str, da: object)[source]#
Parameters:
  • var_name

  • da

Returns:

pre_normalisation(var_name: str, da: object)[source]#
Parameters:
  • var_name

  • da

Returns:

process()[source]#
update_loader_config()[source]#
Returns:

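A condensed sketch of driving the preprocessor; the variable groupings and dates are illustrative, and supplying dates as datetime.date objects is an assumption.

import datetime as dt

from icenet.data.process import IceNetPreProcessor

# Illustrative train/val/test date splits.
train = [dt.date(2020, 1, d) for d in range(1, 21)]
val = [dt.date(2020, 1, d) for d in range(21, 26)]
test = [dt.date(2020, 1, d) for d in range(26, 32)]

pp = IceNetPreProcessor(
    ["uas", "vas"],       # abs_vars
    ["tas", "zg500"],     # anom_vars
    "demo_processing",    # name
    train, val, test,     # train_dates, val_dates, test_dates
)
pp.init_source_data(lag_days=1)   # inherited from Processor
pp.process()
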
icenet.data.producers module#

class icenet.data.producers.DataCollection(*args, identifier: object = None, north: bool = True, south: bool = False, path: str = './data', **kwargs)[source]#

Bases: HemisphereMixin

An abstract base class providing a common interface for data collection classes.

_identifier#

The identifier of the data collection.

_path#

The base path of the data collection.

_hemisphere#

The hemisphere(s) of the data collection.

Type:

int

property base_path: str#

The base path of the data collection.

property identifier: object#

The identifier (label) for this data collection.

class icenet.data.producers.DataProducer(*args, dry: bool = False, overwrite: bool = False, **kwargs)[source]#

Bases: DataCollection

Manages the creation and organisation of data files.

dry#

Flag specifying whether the data producer should be in dry run mode or not.

overwrite#

Flag specifying whether existing files should be overwritten or not.

get_data_var_folder(var: str, append: object = None, hemisphere: object = None, missing_error: bool = False) str[source]#

Returns the path for a specific data variable.

Appends additional folders to the path if specified in the append parameter.

Parameters:
  • var – The data variable.

  • append (optional) – Additional folders to append to the path. Defaults to None.

  • hemisphere (optional) – The hemisphere. Defaults to None.

  • missing_error (optional) – Flag to specify if missing directories should be treated as an error. Defaults to False.

Returns:

The path for the specific data variable.

Return type:

str

class icenet.data.producers.Downloader(*args, **kwargs)[source]#

Bases: DataProducer

Abstract base class for a downloader.

abstractmethod download()[source]#

Abstract download method for this downloader; must be implemented by subclasses.

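A minimal sketch of a concrete downloader, also showing get_data_var_folder inherited from DataProducer; the identifier, variable, and appended sub-folder are illustrative names only.

from icenet.data.producers import Downloader

class DemoDownloader(Downloader):
    # Hypothetical subclass: download() must be implemented, even if trivially.
    def download(self):
        pass

dl = DemoDownloader(identifier="demo_source", north=True)

# Resolve the folder path for a variable, with an extra sub-folder appended.
path = dl.get_data_var_folder("siconca", append=["2020"])
print(path)
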
class icenet.data.producers.Generator(*args, **kwargs)[source]#

Bases: DataProducer

Abstract base class for a generator.

abstractmethod generate()[source]#

Abstract generate method for this generator; must be implemented by subclasses.

class icenet.data.producers.Processor(identifier: str, source_data: object, *args, file_filters: object = (), lead_time: int = 93, test_dates: object = (), train_dates: object = (), val_dates: object = (), **kwargs)[source]#

Bases: DataProducer

An abstract base class for data processing classes.

Provides methods for initialising source data, processing the data, and saving the processed data to standard netCDF files.

_file_filters#

List of file filters to exclude certain files during data processing.

_lead_time#

Forecast/lead time used in the data processing.

source_data#

Path to the source data directory.

_var_files#

Dictionary storing variable files organised by variable name.

_processed_files#

Dictionary storing the processed files organised by variable name.

_dates#

Named tuple that stores the dates used for training, validation, and testing.

property dates: object#

The dates used for training, validation, and testing in this class, as a collections.namedtuple.

init_source_data(lag_days: object = None) None[source]#

Initialises source data by globbing the files and organising them based on date. If lag_days > 0, the previous lag_days days are added where not already present in self._dates; if self._lead_time > 0, the next self._lead_time days are added where not already present in self._dates.

Parameters:

lag_days – The number of lag days to include in the data processing.

Returns:

None. The method updates the _var_files attribute of the Processor object.

Raises:

OSError – If the source data directory does not exist.

property lead_time: int#

The lead time used in the data processing.

abstractmethod process()[source]#

Abstract method defining data processing; must be implemented by subclasses.

property processed_files: dict#

A dict with the processed files organised by variable name.

save_processed_file(var_name: str, name: str, data: object, **kwargs) str[source]#

Save processed data to netCDF file.

Parameters:
  • var_name – The name of the variable.

  • name – The name of the file.

  • data – The data to be saved.

  • **kwargs – Additional keyword arguments to be passed to the get_data_var_folder method.

Returns:

The path of the saved netCDF file.

property source_data: str#

The source data directory as a string.

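A sketch of a minimal Processor subclass; it assumes ./data exists and is laid out as the class expects, and the names are illustrative.

from icenet.data.producers import Processor

class DemoProcessor(Processor):
    # Hypothetical subclass used only to illustrate the interface.
    def process(self):
        # _var_files maps variable names to their globbed source files.
        for var_name, files in self._var_files.items():
            print(var_name, len(files))

proc = DemoProcessor("demo", "./data")
proc.init_source_data()   # raises OSError if the source directory is missing
proc.process()
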
icenet.data.utils module#

icenet.data.utils.assign_lat_lon_coord_system(cube: object)[source]#

Assign coordinate system to iris cube to allow regridding.

Parameters:

cube

icenet.data.utils.esgf_search(server, files_type, local_node, latest, project, format, use_csrf, **search)[source]#

Taken from https://hub.binder.pangeo.io/user/pangeo-data-pan–cmip6-examples-ro965nih/lab and adapted slightly.

Parameters:
  • server

  • files_type

  • local_node

  • latest

  • project

  • format

  • use_csrf

  • search

Returns:

icenet.data.utils.gridcell_angles_from_dim_coords(cube: object)[source]#

Author: Tony Phillips (BAS)

Wrapper for gridcell_angles() that derives the 2D X and Y lon/lat coordinates from 1D X and Y coordinates identifiable as ‘x’ and ‘y’ axes.

The provided cube must have a coordinate system so that its X and Y coordinate bounds (which are derived if necessary) can be converted to lons and lats.

Parameters:

cube

Returns:

icenet.data.utils.invert_gridcell_angles(angles: object)[source]#

Author: Tony Phillips (BAS)

Negate a cube of gridcell angles in place, transforming gridcell_angle_from_true_east <–> true_east_from_gridcell_angle.

Parameters:

angles

icenet.data.utils.rotate_grid_vectors(u_cube: object, v_cube: object, angles: object)[source]#

Author: Tony Phillips (BAS)

Wrapper for rotate_grid_vectors() that can rotate multiple masked spatial fields in one go by iterating over the horizontal spatial axes in slices.

Parameters:
  • u_cube

  • v_cube

  • angles

Returns:

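The angle utilities are typically chained. The sketch below assumes two iris cubes holding wind components on the target grid, that assign_lat_lon_coord_system and invert_gridcell_angles operate in place, and that rotate_grid_vectors returns the rotated (u, v) pair; the file names are placeholders.

import iris

from icenet.data.utils import (assign_lat_lon_coord_system,
                               gridcell_angles_from_dim_coords,
                               invert_gridcell_angles,
                               rotate_grid_vectors)

# Placeholder files holding eastward/northward wind components.
u_cube = iris.load_cube("uas.nc")
v_cube = iris.load_cube("vas.nc")

# Ensure the cubes carry a coordinate system so angles can be derived.
assign_lat_lon_coord_system(u_cube)
assign_lat_lon_coord_system(v_cube)

# Derive gridcell angles, flip their sense, then rotate the vector field.
angles = gridcell_angles_from_dim_coords(u_cube)
invert_gridcell_angles(angles)
u_rot, v_rot = rotate_grid_vectors(u_cube, v_cube, angles)
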
Module contents#