Module jidenn.data.JIDENNDataset
Module containing the `JIDENNDataset` dataclass that is a wrapper for a TensorFlow dataset that allows for easy adding and processing of dataset files.
It contains all the necessary tools to perform a preprocessing of the jet dataset for training.
Expand source code
"""
Module containing the `JIDENNDataset` dataclass that is a wrapper for a TensorFlow dataset that allows for easy adding and processing of dataset files.
It contains all the necessary tools to perform a preprocessing of the jet dataset for training.
"""
from __future__ import annotations
import tensorflow as tf
import pandas as pd
from dataclasses import dataclass
from typing import Union, Literal, Callable, Dict, Tuple, List, Optional, Any
import os
import pickle
#
import jidenn.config.config as config
from jidenn.data.string_conversions import Cut, Expression
from jidenn.evaluation.plotter import plot_data_distributions
ROOTVariables = Dict[str, Union[tf.RaggedTensor, tf.Tensor]]
"""Type alias for a dictionary of ROOT variables. The keys are the variable names and the values are the corresponding
Tensorflow `tf.RaggedTensor` or `tf.Tensor`.
Example:
```python
variables = {
'jets_pt': tf.RaggedTensor([[1, 2, 3, 4, 5], [2, 3]], dtype=tf.float32),
'eventNumber': tf.Tensor([1, 2], dtype=tf.int32),
...
}
```
"""
@tf.function
def dict_to_stacked_array(data: Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], label: int, weight: Optional[float] = None) -> Tuple[Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]], int, Union[float, None]]:
"""Converts a `ROOTVariables` to a input for training a neural network, i.e. a tuple `(input, label, weight)`.
The `input` is construsted by **stacking all the variables** in `data` `ROOTVariables` dictionary into a single `tf.Tensor`.
Optionally, the input data can be a tuple of two ROOTVariables. The output has the form `((input1, input2), label, weight)`.
The `input2` is constructed by **stacking all the variables** in the second `ROOTVariables` dictionary into a single `tf.Tensor`.
Args:
data (ROOTVariables or tuple[ROOTVariables, ROOTVariables]): The input data.
label (int): The label.
weight (float, optional): The weight. Defaults to `None`.
Returns:
A tuple `(input, label, weight)` where `input` is a `tf.Tensor`, or a tuple `((input1, input2), label, weight)`
in case `data` is a tuple of two `ROOTVariables`, where `input1` and `input2` are `tf.Tensor`s.
"""
if isinstance(data, tuple):
interaction = tf.stack([data[1][var] for var in data[1]], axis=-1)
interaction = tf.where(tf.math.logical_or(tf.math.is_inf(interaction), tf.math.is_nan(interaction)),
tf.zeros_like(interaction), interaction)
if weight is None:
return (tf.stack([data[0][var] for var in data[0]], axis=-1), interaction), label
return (tf.stack([data[0][var] for var in data[0]], axis=-1), interaction), label, weight
else:
if weight is None:
return tf.stack([data[var] for var in data.keys()], axis=-1), label
return tf.stack([data[var] for var in data.keys()], axis=-1), label, weight
@dataclass
class JIDENNDataset:
"""The JIDENNDataset dataclass is a wrapper for a TensorFlow dataset that allows for easy loading and processing of dataset files
for jet identification using deep neural networks (**JIDENN**). The `tf.data.Dataset` is constructed from a `tf.data.Dataset`
consisting of `ROOTVariables` dictionaries.
The dataset can be loaded from a file using the `load_dataset` method or set manually using the `set_dataset` method.
Both methods require the `element_spec` either in a pickled file in the case of loading, or as dictionary of `tf.TensorSpec`
or `tf.RaggedTensorSpec` object in the case of setting the dataset manually.
Example:
Typical usage of the `JIDENNDataset` dataclass is as follows:
```python
import tensorflow as tf
from jidenn.config.config_subclasses import Variables
from .utils.Cut import Cut
@tf.function
def count_PFO(sample: ROOTVariables) -> ROOTVariables:
sample = sample.copy()
sample['jets_PFO_n'] = tf.reduce_sum(tf.ones_like(sample['jets_PFO_pt']))
return sample
@tf.function
def train_input(sample: ROOTVariables) -> ROOTVariables:
output = {
'N_PFO': sample['jets_PFO_n'],
'pt': sample['jets_pt'],
'width': sample['jets_Width'],
'EMFrac': sample['jets_EMFrac'],
'mu': sample['corrected_averageInteractionsPerCrossing[0]']
}
return output
variables = ['corrected_averageInteractionsPerCrossing[0]', 'jets_pt', 'jets_Width', 'jets_EMFrac','jets_PFO_pt']
jidenn_dataset = JIDENNDataset(variables=variables,
target='jets_TruthLabelID',
weight=None)
jidenn_dataset = jidenn_dataset.load_dataset('path/to/dataset')
jidenn_dataset = jidenn_dataset.create_variables(cut=Cut('jets_pt > 10_000'), map_dataset=count_PFO)
jidenn_dataset = jidenn_dataset.resample_dataset(lambda data, label: tf.cast(tf.greater(label, 0), tf.int32), [0.5, 0.5])
jidenn_dataset = jidenn_dataset.remap_labels(lambda label: tf.cast(tf.greater(label, 0), tf.int32))
jidenn_dataset = jidenn_dataset.create_train_input(train_input)
dataset = jidenn_dataset.get_prepared_dataset(batch_size=128,
shuffle_buffer_size=1000,
take=100_000,
assert_length=True)
model.fit(dataset, epochs=10)
```
Args:
variables (List[str]): The list of variables to be used in the dataset.
target (str, optional): The name of the target variable. Defaults to `None`.
weight (str, optional): The name of the weight variable. Defaults to `None`.
"""
variables: Optional[List[str]] = None
"""The configuration dataclass of the variables to be used in the dataset.
If `None`, the variables are set automatically during loading with JIDENNDataset.load()."""
target: Optional[str] = None
"""The name of the target variable. `None` if no target variable is used."""
weight: Optional[str] = None
"""The name of the weight variable. `None` if no weight variable is used."""
def __post_init__(self):
self._dataset = None
self._element_spec = None
def load_dataset(self, file: str) -> JIDENNDataset:
"""Loads a dataset from a file. The dataset is stored in the `tf.data.Dataset` format.
The `element_spec` is loaded from the `element_spec` file inside the dataset directory.
Alternatively, the `element_spec` can be loaded manually using the `load_element_spec` method.
Args:
file (str): The path to the dataset directory.
Returns:
JIDENNDataset: The JIDENNDataset object with set dataset and `element_spec`.
"""
if self.element_spec is None:
element_spec_file = os.path.join(file, 'element_spec')
jidenn_dataset = self.load_element_spec(element_spec_file)
else:
jidenn_dataset = self
dataset = tf.data.Dataset.load(
file, compression='GZIP', element_spec=jidenn_dataset.element_spec)
return jidenn_dataset._set_dataset(dataset)
@staticmethod
def load(path: str, element_spec_path: Optional[str] = None) -> JIDENNDataset:
"""Loads a dataset from a file. The dataset is stored in the `tf.data.Dataset` format.
The assumed dataset elements are `ROOTVariables` dictionaries or a tuple of `ROOTVariables`, `label` and `weight`.
Args:
path (str): The path to the dataset directory.
element_spec_path (str, optional): The path to the `element_spec` file. Defaults to `None`.
If `None`, the `element_spec` is loaded from the `element_spec` file inside the dataset directory.
Raises:
ValueError: If the `element_spec` is not a dictionary or a tuple whose first element is a dictionary.
Returns:
JIDENNDataset: The JIDENNDataset object with set dataset and `element_spec`.
"""
if element_spec_path is None:
element_spec_path = os.path.join(path, 'element_spec')
with open(element_spec_path, 'rb') as f:
element_spec = pickle.load(f)
if isinstance(element_spec, dict):
variables = list(element_spec.keys())
elif isinstance(element_spec[0], dict):
variables = list(element_spec[0].keys())
else:
raise ValueError('Element spec is not a dictionary.')
return JIDENNDataset(variables=variables).load_dataset(path)
def save_dataset(self, file: str, num_shards: Optional[int] = None) -> None:
"""Saves the dataset to a file. The dataset is stored in the `tf.data.Dataset` format.
The `element_spec` is stored in the `element_spec` file inside the dataset directory.
TensorFlow saves the `element_spec.pb` file automatically, but a manual save is required
for further processing of the dataset. The TensorFlow file has the `.pb` extension.
Args:
file (str): The path to the dataset directory.
num_shards (int, optional): The number of shards to split the dataset into. Defaults to `None`. Each element is assigned to one of the `num_shards` files uniformly at random.
Raises:
ValueError: If the dataset is not loaded yet.
Returns:
None
"""
if self.dataset is None:
raise ValueError('Dataset not loaded yet.')
@tf.function
def random_shards(_) -> tf.Tensor:
return tf.random.uniform(shape=[], minval=0, maxval=num_shards, dtype=tf.int64)
self.dataset.save(file, compression='GZIP',
shard_func=random_shards if num_shards is not None else None)
with open(os.path.join(file, 'element_spec'), 'wb') as f:
pickle.dump(self.dataset.element_spec, f)
def load_element_spec(self, file: str) -> JIDENNDataset:
"""Loads the `element_spec` from a file. The `element_spec` is a pickled dictionary of `tf.TensorSpec` or `tf.RaggedTensorSpec` objects.
Args:
file (str): The path to the `element_spec` file.
Returns:
JIDENNDataset: The JIDENNDataset object with the `element_spec` set.
"""
with open(file, 'rb') as f:
element_spec = pickle.load(f)
return self._set_element_spec(element_spec)
def create_variables(self, cut: Optional[Cut] = None, map_dataset: Optional[Callable[[ROOTVariables], ROOTVariables]] = None) -> JIDENNDataset:
"""Creates a 'tf.data.Dataset' from selected variables and creates labels and weights.
The variables are selected according to the `variables` loaded from config.
the `target` and `weight` class variables are used to create labels and weights from the `ROOTVariables`.
Optionally, a `Cut` can be applied to the dataset. It is done **before** the variables are selected.
The `map_dataset` function can be used to apply a function to the dataset before the variables are selected.
It could be used to create new variables from the existing ones.
Args:
cut (jidenn.data.utils.Cut.Cut, optional): The `Cut` object to be applied to the dataset. Defaults to `None`.
map_dataset (Callable[[ROOTVariables], ROOTVariables], optional): The function to be applied to the dataset using `tf.data.Dataset.map`. Defaults to `None`.
Raises:
ValueError: If the dataset is not loaded yet.
Returns:
JIDENNDataset: The JIDENNDataset object with the signature of `(ROOTVariables, label, weight)`.
"""
if self.dataset is None:
raise ValueError('Dataset not loaded yet.')
if map_dataset is not None:
dataset = self.dataset.map(map_dataset)
else:
dataset = self.dataset
dataset = dataset.filter(cut) if cut is not None else dataset
dataset = dataset.map(self._var_picker)
return self._set_dataset(dataset)
def remap_labels(self, label_mapping: Callable[[int], int]) -> JIDENNDataset:
"""Remaps the labels in the dataset using the `label_mapping` function.
Should be used after the `create_variables` method.
Args:
label_mapping (Callable[[int], int]): The function that maps the labels.
Raises:
ValueError: If the dataset is not loaded yet.
ValueError: If the `target` is not set.
Returns:
JIDENNDataset: The JIDENNDataset object where the `label` is remapped.
"""
if self.dataset is None:
raise ValueError('Dataset not loaded yet.')
if self.target is None:
raise ValueError('Target not set yet.')
if self.weight is not None:
@tf.function
def remap_label(x, y, w):
return x, label_mapping(y), w
else:
@tf.function
def remap_label(x, y):
return x, label_mapping(y)
dataset = self.dataset.map(remap_label)
return self._set_dataset(dataset)
@property
def dataset(self) -> Union[tf.data.Dataset, None]:
"""The `tf.data.Dataset` object or `None` if the dataset is not set yet."""
return self._dataset
@property
def element_spec(self) -> Union[Dict[str, Union[tf.TensorSpec, tf.RaggedTensorSpec]], None]:
"""The `element_spec` of the dataset or `None` if the dataset is not set yet."""
return self._element_spec
def _set_element_spec(self, element_spec: Dict[str, Union[tf.TensorSpec, tf.RaggedTensorSpec]]) -> JIDENNDataset:
jidenn_dataset = JIDENNDataset(variables=self.variables,
target=self.target,
weight=self.weight)
jidenn_dataset._element_spec = element_spec
jidenn_dataset._dataset = self._dataset
return jidenn_dataset
def _set_dataset(self, dataset: Union[tf.data.Dataset, None]) -> JIDENNDataset:
jidenn_dataset = JIDENNDataset(variables=self.variables,
target=self.target,
weight=self.weight)
jidenn_dataset._dataset = dataset
jidenn_dataset._element_spec = dataset.element_spec
return jidenn_dataset
def set_dataset(self, dataset: tf.data.Dataset, element_spec: Dict[str, Union[tf.TensorSpec, tf.RaggedTensorSpec]]) -> JIDENNDataset:
"""Sets the `tf.data.Dataset` object and the `element_spec` of the dataset.
Args:
dataset (tf.data.Dataset): The `tf.data.Dataset` object consisting of `ROOTVariables`.
element_spec (Dict[str, Union[tf.TensorSpec, tf.RaggedTensorSpec]]): The `element_spec` of the dataset.
Returns:
JIDENNDataset: The JIDENNDataset object with the `dataset` and `element_spec` set.
"""
jidenn_dataset = JIDENNDataset(variables=self.variables,
target=self.target,
weight=self.weight)
jidenn_dataset._dataset = dataset
jidenn_dataset._element_spec = element_spec
return jidenn_dataset
@property
def _var_picker(self):
@tf.function
def _pick_variables(sample: ROOTVariables) -> Union[Tuple[ROOTVariables, tf.RaggedTensor, tf.RaggedTensor], ROOTVariables, Tuple[ROOTVariables, tf.RaggedTensor]]:
new_sample = {var: Expression(var)(sample)
for var in self.variables}
if self.target is None:
return new_sample
if self.weight is None:
return new_sample, Expression(self.target)(sample)
else:
return new_sample, Expression(self.target)(sample), Expression(self.weight)(sample)
return _pick_variables
def resample_dataset(self, resampling_func: Callable[[ROOTVariables, Any], int], target_dist: List[float]) -> JIDENNDataset:
"""Resamples the dataset using the `resampling_func` function. The function computes the bin index for each sample in the dataset.
The dataset is then resampled to match the `target_dist` distribution. Be careful that this may **slow down the training process**
if the target distribution is very different from the original one, as the dataset is resampled on the fly and waits
for an appropriate sample to be drawn.
Args:
resampling_func (Callable[[ROOTVariables, Any], int]): Function that bins the data. It must return an integer between 0 and `len(target_dist) - 1`.
target_dist (List[float]): The target distribution of the resampled dataset.
Raises:
ValueError: If the dataset is not loaded yet.
Returns:
JIDENNDataset: The JIDENNDataset object where the dataset is resampled.
"""
if self.dataset is None:
raise ValueError('Dataset not loaded yet.')
@tf.function
def _data_only(x, data):
return data
dataset = self.dataset.rejection_resample(
resampling_func, target_dist=target_dist).map(_data_only)
return self._set_dataset(dataset)
@staticmethod
def combine(datasets: List[JIDENNDataset], weights: List[float]) -> JIDENNDataset:
"""Combines multiple datasets into one dataset. The samples are interleaved and the weights are used to sample from the datasets.
Args:
datasets (List[JIDENNDataset]): List of datasets to be combined. All `JIDENNDataset.dataset`s must be set and have the same `element_spec`.
weights (List[float]): List of weights for each dataset. The weights are used to sample from the datasets.
Returns:
JIDENNDataset: Combined `JIDENNDataset` object.
"""
dataset = tf.data.Dataset.sample_from_datasets(
[dataset.dataset for dataset in datasets], weights=weights)
jidenn_dataset = JIDENNDataset(
datasets[0].variables, datasets[0].target, datasets[0].weight)
return jidenn_dataset._set_dataset(dataset)
def apply(self, func: Callable[[tf.data.Dataset], tf.data.Dataset]) -> JIDENNDataset:
"""Applies a function to the dataset.
Args:
func (Callable[[tf.data.Dataset], tf.data.Dataset]): Function to apply to the dataset.
Returns:
JIDENNDataset: The JIDENNDataset object with the dataset modified by the function.
"""
if self.dataset is None:
raise ValueError('Dataset not loaded yet.')
dataset = func(self.dataset)
return self._set_dataset(dataset)
def create_train_input(self, func: Callable[[ROOTVariables], Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]]]) -> JIDENNDataset:
"""Creates a training input from the dataset using the `func` function. The function must take a `ROOTVariables` object and return a `ROOTVariables` object.
The output of the function is of the form Dict[str, tf.Tensor] or Tuple[Dict[str, tf.Tensor], Dict[str, tf.Tensor]] (optionally also tf.RaggedTensor).
Args:
func (Callable[[ROOTVariables], Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]]]): Function to apply to the data to create the training input.
Raises:
ValueError: If the dataset is not loaded yet.
Returns:
JIDENNDataset: The JIDENNDataset object with signature `((ROOTVariables, ROOTVariables), ...)` or `(ROOTVariables, ...)`.
"""
if self.dataset is None:
raise ValueError('Dataset not loaded yet.')
@tf.function
def input_wrapper(data, label, w=None):
return func(data), label
dataset = self.dataset.map(input_wrapper)
return self._set_dataset(dataset)
def to_pandas(self, variables: Optional[List[str]] = None) -> pd.DataFrame:
"""Converts the dataset to a pandas DataFrame. The dataset must be loaded before calling this function.
The function uses `tensorflow_datasets.as_dataframe` to convert the dataset to a pandas DataFrame, so
the `tensorflow_datasets` package must be installed.
Be careful that this function may take a **long time to run**, depending on the size of the dataset.
Consider taking only a subset of the dataset before converting it to a pandas DataFrame.
```python
jidenn_dataset = JIDENNDataset(...)
...
jidenn_dataset = jidenn_dataset.apply(lambda dataset: dataset.take(1_000))
df = jidenn_dataset.to_pandas()
```
If the dataset contains nested tuples, consider using `jidenn.data.data_info.explode_nested_variables`
on the tuple columns of the converted dataframe.
Args:
variables (Optional[List[str]], optional): List of variables to convert to a pandas DataFrame. If `None`, all variables are converted. Defaults to `None`.
Raises:
ImportError: If `tensorflow_datasets` is not installed.
ValueError: If the dataset is not loaded yet.
Returns:
pd.DataFrame: The `tf.data.Dataset` converted to a pandas `pd.DataFrame`.
"""
try:
import tensorflow_datasets as tfds
except ImportError:
raise ImportError(
'Please install tensorflow_datasets to use this function. Use `pip install tensorflow_datasets`.')
if self.dataset is None:
raise ValueError('Dataset not loaded yet.')
if isinstance(self.element_spec, tuple) and variables is None:
@tf.function
def tuple_to_dict(data, label, weight=None):
if isinstance(data, tuple):
data = {**data[0], **data[1]}
data = {**data, 'label': label, 'weight': weight}
return data
elif isinstance(self.element_spec, tuple) and variables is not None:
@tf.function
def tuple_to_dict(data, label, weight=None):
if isinstance(data, tuple):
data = {**data[0], **data[1]}
data = {**data, 'label': label, 'weight': weight}
return {k: data[k] for k in variables + ['label', 'weight']}
elif isinstance(self.element_spec, dict) and variables is not None:
@tf.function
def tuple_to_dict(data):
return {k: data[k] for k in variables}
elif isinstance(self.element_spec, dict) and variables is None:
@tf.function
def tuple_to_dict(data):
return data
else:
raise ValueError('The dataset must be a tuple or a dict.')
dataset = self.dataset.map(tuple_to_dict)
df = tfds.as_dataframe(dataset)
df = df.rename(lambda x: x.replace('/', '.'), axis='columns')
return df
def filter(self, filter: Callable[[ROOTVariables], bool]) -> JIDENNDataset:
"""Filters the dataset using the `filter` function.
Args:
filter (Callable[[ROOTVariables], bool]): Function to apply to the data.
Raises:
ValueError: If the dataset is not loaded yet.
Returns:
JIDENNDataset: The JIDENNDataset object with the dataset filtered.
"""
if self.dataset is None:
raise ValueError('Dataset not loaded yet.')
dataset = self.dataset.filter(filter)
return self._set_dataset(dataset)
def get_prepared_dataset(self,
batch_size: int,
assert_length: bool = False,
shuffle_buffer_size: Optional[int] = None,
take: Optional[int] = None,
map_func: Optional[Callable[[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any], Tuple[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any]]] = None) -> tf.data.Dataset:
"""Returns a prepared dataset for training. The dataset is prepared by stacking the arrays in the `ROOTVariables` in the dataset using `dict_to_stacked_array`.
The dataset is also batched, shuffled, shortened (using `take`) and mapped using the `map_func` function. The `map_func` function is applied before the input is stacked.
**Train input must be created with `JIDENNDataset.create_train_input` before calling this method.**
The assertion allows displaying the estimated epoch time during training. The assertion is only performed if `take` is set.
Args:
batch_size (int): Batch size of the dataset.
assert_length (bool, optional): If `True`, the dataset is asserted to have the `take` length. It is only used if 'take' is set. Defaults to False.
shuffle_buffer_size (int, optional): Size of the shuffle buffer. If `None`, the dataset is not shuffled. Defaults to None.
take (int, optional): Number of elements to take from the dataset. If `None`, the full dataset is used. Defaults to None.
map_func (Callable[[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any], Tuple[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any]], optional): Function to apply to the dataset. Defaults to None.
Raises:
ValueError: If the dataset is not loaded yet.
Returns:
tf.data.Dataset: The prepared dataset.
"""
if self.dataset is None:
raise ValueError('Dataset not loaded yet.')
if map_func is not None:
dataset = self.dataset.map(map_func)
else:
dataset = self.dataset.map(dict_to_stacked_array)
dataset = dataset.shuffle(
shuffle_buffer_size) if shuffle_buffer_size is not None else dataset
if take is not None:
dataset = dataset.take(take)
dataset = dataset.apply(tf.data.experimental.assert_cardinality(
take)) if assert_length else dataset
dataset = dataset.apply(
tf.data.experimental.dense_to_ragged_batch(batch_size))
# dataset = dataset.ragged_batch(batch_size)
dataset = dataset.prefetch(tf.data.AUTOTUNE)
return dataset
def plot_data_distributions(self,
folder: str,
variables: Optional[List[str]] = None,
hue_variable: Optional[str] = None,
named_labels: Optional[Dict[int, str]] = None,
xlabel_mapper: Optional[Dict[str, str]] = None) -> None:
"""Plots the data distributions of the dataset. The dataset must be loaded before calling this function.
The function uses `jidenn.evaluation.plotter.plot_data_distributions` to plot the data distributions.
Args:
folder (str): The path to the directory where the plots are saved.
variables (Optional[List[str]], optional): List of variables to plot. If `None`, all variables are plotted. Defaults to `None`.
hue_variable (str, optional): The name of the variable used to split the plotted distributions into separate (coloured) groups. Defaults to `None`.
named_labels (Dict[int, str], optional): Dictionary mapping truth values to custom labels.
If not provided, the truth values will be used as labels.
xlabel_mapper (Dict[str, str], optional): Dictionary mapping variable names to custom x-axis labels. Defaults to `None`.
Raises:
ValueError: If the dataset is not loaded yet.
Returns:
None
"""
if self.dataset is None:
raise ValueError('Dataset not loaded yet.')
df = self.to_pandas(variables)
plot_data_distributions(df, folder=folder, named_labels=named_labels,
xlabel_mapper=xlabel_mapper, hue_variable=hue_variable)
Global variables
var ROOTVariables
-
Type alias for a dictionary of ROOT variables. The keys are the variable names and the values are the corresponding
Tensorflow `tf.RaggedTensor` or `tf.Tensor`.
Example:
variables = {
    'jets_pt': tf.RaggedTensor([[1, 2, 3, 4, 5], [2, 3]], dtype=tf.float32),
    'eventNumber': tf.Tensor([1, 2], dtype=tf.int32),
    ...
}
Functions
def dict_to_stacked_array(data: Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], label: int, weight: Optional[float] = None) ‑> Tuple[Union[tensorflow.python.framework.ops.Tensor, Tuple[tensorflow.python.framework.ops.Tensor, tensorflow.python.framework.ops.Tensor]], int, Optional[float]]
-
Converts a `ROOTVariables` dictionary to an input for training a neural network, i.e. a tuple `(input, label, weight)`.
The `input` is constructed by stacking all the variables in the `data` `ROOTVariables` dictionary into a single `tf.Tensor`.
Optionally, the input data can be a tuple of two `ROOTVariables`. The output then has the form `((input1, input2), label, weight)`,
where `input2` is constructed by stacking all the variables in the second `ROOTVariables` dictionary into a single `tf.Tensor`.
Args
data : ROOTVariables or tuple[ROOTVariables, ROOTVariables]
- The input data.
label : int
- The label.
weight : float, optional
- The weight. Defaults to `None`.
Returns
A tuple `(input, label, weight)` where `input` is a `tf.Tensor`, or a tuple `((input1, input2), label, weight)`
in case `data` is a tuple of two `ROOTVariables`, where `input1` and `input2` are `tf.Tensor`s.
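A minimal sketch of what the stacking produces for the dictionary form (the variable names and values below are made up for illustration):
```python
import tensorflow as tf

# Two hypothetical per-constituent variables; stacking follows the dict's key order.
data = {
    'pt': tf.constant([50.0, 30.0, 20.0]),
    'eta': tf.constant([0.1, -1.2, 0.7]),
}
inputs, label = dict_to_stacked_array(data, label=1)
# `inputs` now has shape (3, 2): one row per constituent, one column per variable.
```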
Classes
class JIDENNDataset (variables: Optional[List[str]] = None, target: Optional[str] = None, weight: Optional[str] = None)
-
The JIDENNDataset dataclass is a wrapper for a TensorFlow dataset that allows for easy loading and processing of dataset files
for jet identification using deep neural networks (JIDENN). The `tf.data.Dataset` is constructed from a `tf.data.Dataset`
consisting of `ROOTVariables` dictionaries.
The dataset can be loaded from a file using the `load_dataset` method or set manually using the `set_dataset` method.
Both methods require the `element_spec`, either in a pickled file in the case of loading, or as a dictionary of `tf.TensorSpec`
or `tf.RaggedTensorSpec` objects in the case of setting the dataset manually.
Example: Typical usage of the `JIDENNDataset` dataclass is as follows:
import tensorflow as tf
from jidenn.config.config_subclasses import Variables
from .utils.Cut import Cut

@tf.function
def count_PFO(sample: ROOTVariables) -> ROOTVariables:
    sample = sample.copy()
    sample['jets_PFO_n'] = tf.reduce_sum(tf.ones_like(sample['jets_PFO_pt']))
    return sample

@tf.function
def train_input(sample: ROOTVariables) -> ROOTVariables:
    output = {
        'N_PFO': sample['jets_PFO_n'],
        'pt': sample['jets_pt'],
        'width': sample['jets_Width'],
        'EMFrac': sample['jets_EMFrac'],
        'mu': sample['corrected_averageInteractionsPerCrossing[0]']
    }
    return output

variables = ['corrected_averageInteractionsPerCrossing[0]', 'jets_pt', 'jets_Width', 'jets_EMFrac', 'jets_PFO_pt']
jidenn_dataset = JIDENNDataset(variables=variables,
                               target='jets_TruthLabelID',
                               weight=None)
jidenn_dataset = jidenn_dataset.load_dataset('path/to/dataset')
jidenn_dataset = jidenn_dataset.create_variables(cut=Cut('jets_pt > 10_000'), map_dataset=count_PFO)
jidenn_dataset = jidenn_dataset.resample_dataset(lambda data, label: tf.cast(tf.greater(label, 0), tf.int32), [0.5, 0.5])
jidenn_dataset = jidenn_dataset.remap_labels(lambda label: tf.cast(tf.greater(label, 0), tf.int32))
jidenn_dataset = jidenn_dataset.create_train_input(train_input)
dataset = jidenn_dataset.get_prepared_dataset(batch_size=128,
                                              shuffle_buffer_size=1000,
                                              take=100_000,
                                              assert_length=True)
model.fit(dataset, epochs=10)
Args
variables : List[str]
- The list of variables to be used in the dataset.
target : str, optional
- The name of the target variable. Defaults to `None`.
weight : str, optional
- The name of the weight variable. Defaults to `None`.
Class variables
var target : Optional[str]
-
The name of the target variable. `None` if no target variable is used.
var variables : Optional[List[str]]
-
The list of variables to be used in the dataset. If `None`, the variables are set automatically during loading with `JIDENNDataset.load()`.
var weight : Optional[str]
-
The name of the weight variable. `None` if no weight variable is used.
Static methods
def combine(datasets: List[JIDENNDataset], weights: List[float]) ‑> JIDENNDataset
-
Combines multiple datasets into one dataset. The samples are interleaved and the weights are used to sample from the datasets.
Args
datasets : List[JIDENNDataset]
- List of datasets to be combined. All `JIDENNDataset.dataset`s must be set and have the same `element_spec`.
weights : List[float]
- List of weights for each dataset. The weights are used to sample from the datasets.
Returns
JIDENNDataset
- Combined `JIDENNDataset` object.
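For instance, datasets of two different physics processes can be mixed into a single stream; a hedged sketch (paths and mixing weights are placeholders):
```python
# Both datasets must already be loaded and share the same element_spec.
ds_dijet = JIDENNDataset.load('path/to/dijet_dataset')
ds_zjet = JIDENNDataset.load('path/to/zjet_dataset')

# Draw roughly 70% of the samples from the first dataset and 30% from the second.
mixed = JIDENNDataset.combine([ds_dijet, ds_zjet], weights=[0.7, 0.3])
```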
def load(path: str, element_spec_path: Optional[str] = None) ‑> JIDENNDataset
-
Loads a dataset from a file. The dataset is stored in the `tf.data.Dataset` format. The assumed dataset elements
are `ROOTVariables` dictionaries or a tuple of `ROOTVariables`, `label` and `weight`.
Args
path : str
- The path to the dataset directory.
element_spec_path : str, optional
- The path to the `element_spec` file. Defaults to `None`. If `None`, the `element_spec` is loaded from the `element_spec` file inside the dataset directory.
Raises
ValueError
- If the `element_spec` is not a dictionary or a tuple whose first element is a dictionary.
Returns
JIDENNDataset
- The JIDENNDataset object with set dataset and `element_spec`.
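A short usage sketch (the paths are placeholders); the second call assumes the `element_spec` pickle was stored separately:
```python
# The element_spec is picked up from '<dataset>/element_spec' by default.
jidenn_dataset = JIDENNDataset.load('path/to/dataset')

# ... or point to an element_spec stored elsewhere.
jidenn_dataset = JIDENNDataset.load('path/to/dataset', element_spec_path='path/to/element_spec')
```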
Instance variables
var dataset : Optional[tensorflow.python.data.ops.dataset_ops.DatasetV2]
-
The `tf.data.Dataset` object or `None` if the dataset is not set yet.
var element_spec : Optional[Dict[str, Union[tensorflow.python.framework.tensor_spec.TensorSpec, tensorflow.python.ops.ragged.ragged_tensor.RaggedTensorSpec]]]
-
The `element_spec` of the dataset or `None` if the dataset is not set yet.
Methods
def apply(self, func: Callable[[tf.data.Dataset], tf.data.Dataset]) ‑> JIDENNDataset
-
Applies a function to the dataset.
Args
func : Callable[[tf.data.Dataset], tf.data.Dataset]
- Function to apply to the dataset.
Returns
JIDENNDataset
- The JIDENNDataset object with the dataset modified by the function.
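A typical use is to push an arbitrary `tf.data.Dataset` transformation through the wrapper, e.g. taking a small subset before an expensive conversion (a minimal sketch, assuming `jidenn_dataset` is already loaded):
```python
# Keep only the first 1 000 elements, e.g. before converting to pandas.
small = jidenn_dataset.apply(lambda ds: ds.take(1_000))
```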
def create_train_input(self, func: Callable[[ROOTVariables], Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]]]) ‑> JIDENNDataset
-
Creates a training input from the dataset using the `func` function. The function must take a `ROOTVariables`
object and return a `ROOTVariables` object. The output of the function is of the form `Dict[str, tf.Tensor]` or
`Tuple[Dict[str, tf.Tensor], Dict[str, tf.Tensor]]` (optionally also `tf.RaggedTensor`).
Args
func : Callable[[ROOTVariables], Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]]]
- Function to apply to the data to create the training input.
Raises
ValueError
- If the dataset is not loaded yet.
Returns
JIDENNDataset
- The JIDENNDataset object with signature `((ROOTVariables, ROOTVariables), ...)` or `(ROOTVariables, ...)`.
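The tuple-of-two-dictionaries form mentioned above can be sketched as follows; the variable names are illustrative only and not guaranteed to exist in a given dataset, and `jidenn_dataset` is assumed to be already set up:
```python
import tensorflow as tf
from typing import Tuple
from jidenn.data.JIDENNDataset import ROOTVariables

@tf.function
def train_input(sample: ROOTVariables) -> Tuple[ROOTVariables, ROOTVariables]:
    # First dictionary: per-constituent variables.
    constituents = {'pt': sample['jets_PFO_pt']}
    # Second dictionary: pairwise/interaction variables (hypothetical variable name).
    interactions = {'delta_R': sample['jets_PFO_deltaR']}
    return constituents, interactions

jidenn_dataset = jidenn_dataset.create_train_input(train_input)
```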
def create_variables(self, cut: Optional[Cut] = None, map_dataset: Optional[Callable[[ROOTVariables], ROOTVariables]] = None) ‑> JIDENNDataset
-
Creates a tf.data.Dataset from the selected variables and creates labels and weights. The variables are selected according to the variables loaded from the config. The target and weight class variables are used to create labels and weights from the ROOTVariables.
Optionally, a Cut can be applied to the dataset. It is applied before the variables are selected. The map_dataset function can be used to apply a function to the dataset before the variables are selected, e.g. to create new variables from the existing ones.
Args
cut
:jidenn.data.utils.Cut.Cut
, optional- The Cut object to be applied to the dataset. Defaults to None.
map_dataset
:Callable[[ROOTVariables], ROOTVariables]
, optional- The function to be applied to the dataset using tf.data.Dataset.map. Defaults to None.
Raises
ValueError
- If the dataset is not loaded yet.
Returns
JIDENNDataset
- The JIDENNDataset object with the signature of
(ROOTVariables, label, weight)
.
Expand source code
def create_variables(self, cut: Optional[Cut] = None, map_dataset: Optional[Callable[[ROOTVariables], ROOTVariables]] = None) -> JIDENNDataset:
    """Creates a `tf.data.Dataset` from the selected variables and creates labels and weights. The variables are selected
    according to the `variables` loaded from the config. The `target` and `weight` class variables are used to create labels
    and weights from the `ROOTVariables`.

    Optionally, a `Cut` can be applied to the dataset. It is applied **before** the variables are selected. The `map_dataset`
    function can be used to apply a function to the dataset before the variables are selected, e.g. to create new variables
    from the existing ones.

    Args:
        cut (jidenn.data.utils.Cut.Cut, optional): The `Cut` object to be applied to the dataset. Defaults to `None`.
        map_dataset (Callable[[ROOTVariables], ROOTVariables], optional): The function to be applied to the dataset using `tf.data.Dataset.map`. Defaults to `None`.

    Raises:
        ValueError: If the dataset is not loaded yet.

    Returns:
        JIDENNDataset: The JIDENNDataset object with the signature of `(ROOTVariables, label, weight)`.
    """
    if self.dataset is None:
        raise ValueError('Dataset not loaded yet.')
    if map_dataset is not None:
        dataset = self.dataset.map(map_dataset)
    else:
        dataset = self.dataset
    dataset = dataset.filter(cut) if cut is not None else dataset
    dataset = dataset.map(self._var_picker)
    return self._set_dataset(dataset)
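A sketch of a typical call, assuming Cut can be constructed from a cut-expression string and that the variable names are purely illustrative:
```python
# Hypothetical cut string and derived variable; adjust to the actual config.
cut = Cut('jets_pt > 20000')  # assumption: Cut is built from a string expression

def add_log_pt(sample):
    sample = dict(sample)
    sample['jets_log_pt'] = tf.math.log(sample['jets_pt'])  # illustrative derived variable
    return sample

jidenn_dataset = jidenn_dataset.create_variables(cut=cut, map_dataset=add_log_pt)
```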
def filter(self, filter: Callable[[ROOTVariables], bool]) ‑> JIDENNDataset
-
Filters the dataset using the filter function.
Args
filter
:Callable[[ROOTVariables], bool]
- Function to apply to the data.
Raises
ValueError
- If the dataset is not loaded yet.
Returns
JIDENNDataset
- The JIDENNDataset object with the dataset filtered.
Expand source code
def filter(self, filter: Callable[[ROOTVariables], bool]) -> JIDENNDataset:
    """Filters the dataset using the `filter` function.

    Args:
        filter (Callable[[ROOTVariables], bool]): Function to apply to the data.

    Raises:
        ValueError: If the dataset is not loaded yet.

    Returns:
        JIDENNDataset: The JIDENNDataset object with the dataset filtered.
    """
    if self.dataset is None:
        raise ValueError('Dataset not loaded yet.')
    dataset = self.dataset.filter(filter)
    return self._set_dataset(dataset)
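A minimal sketch of an event-level selection; the parity cut is illustrative and eventNumber follows the ROOTVariables example above:
```python
# Illustrative only: keep events with an even event number.
jidenn_dataset = jidenn_dataset.filter(lambda sample: sample['eventNumber'] % 2 == 0)
```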
def get_prepared_dataset(self, batch_size: int, assert_length: bool = False, shuffle_buffer_size: Optional[int] = None, take: Optional[int] = None, map_func: Optional[Callable[[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any], Tuple[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any]]] = None) ‑> tensorflow.python.data.ops.dataset_ops.DatasetV2
-
Returns a prepared dataset for training. The dataset is prepared by stacking the arrays in the ROOTVariables in the dataset using dict_to_stacked_array(). The dataset is also batched, shuffled, shortened (using take) and mapped using the map_func function. The function is applied before the input is stacked.
Train input must be created with JIDENNDataset.create_train_input() before calling this method.
The assertion allows displaying the estimated epoch time during training. The assertion is only performed if take is set.
Args
batch_size
:int
- Batch size of the dataset.
assert_length
:bool
, optional- If True, the dataset is asserted to have the take length. It is only used if take is set. Defaults to False.
shuffle_buffer_size
:int
, optional- Size of the shuffle buffer. If None, the dataset is not shuffled. Defaults to None.
take
:int
, optional- Number of elements to take from the dataset. If None, no take is applied. Defaults to None.
map_func
:Callable[[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any], Tuple[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any]]
, optional- Function to apply to the dataset. Defaults to None.
Raises
ValueError
- If the dataset is not loaded yet.
Returns
tf.data.Dataset
- The prepared dataset.
Expand source code
def get_prepared_dataset(self, batch_size: int, assert_length: bool = False, shuffle_buffer_size: Optional[int] = None, take: Optional[int] = None, map_func: Optional[Callable[[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any], Tuple[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any]]] = None) -> tf.data.Dataset:
    """Returns a prepared dataset for training. The dataset is prepared by stacking the arrays in the `ROOTVariables`
    in the dataset using `dict_to_stacked_array`. The dataset is also batched, shuffled, shortened (using `take`) and
    mapped using the `map_func` function. The function is applied before the input is stacked.

    **Train input must be created with `JIDENNDataset.create_train_input` before calling this method.**

    The assertion allows displaying the estimated epoch time during training. The assertion is only performed if `take` is set.

    Args:
        batch_size (int): Batch size of the dataset.
        assert_length (bool, optional): If `True`, the dataset is asserted to have the `take` length. It is only used if `take` is set. Defaults to False.
        shuffle_buffer_size (int, optional): Size of the shuffle buffer. If `None`, the dataset is not shuffled. Defaults to None.
        take (int, optional): Number of elements to take from the dataset. If `None`, no take is applied. Defaults to None.
        map_func (Callable[[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any], Tuple[Union[ROOTVariables, Tuple[ROOTVariables, ROOTVariables]], Any]], optional): Function to apply to the dataset. Defaults to None.

    Raises:
        ValueError: If the dataset is not loaded yet.

    Returns:
        tf.data.Dataset: The prepared dataset.
    """
    if self.dataset is None:
        raise ValueError('Dataset not loaded yet.')
    if map_func is not None:
        dataset = self.dataset.map(map_func)
    else:
        dataset = self.dataset.map(dict_to_stacked_array)
    dataset = dataset.shuffle(shuffle_buffer_size) if shuffle_buffer_size is not None else dataset
    if take is not None:
        dataset = dataset.take(take)
        dataset = dataset.apply(tf.data.experimental.assert_cardinality(take)) if assert_length else dataset
    dataset = dataset.apply(tf.data.experimental.dense_to_ragged_batch(batch_size))
    # dataset = dataset.ragged_batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset
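A sketch of preparing the training pipeline, assuming create_train_input has already been called; the batch size, buffer size and take count are illustrative:
```python
# Hypothetical training pipeline settings.
train_ds = jidenn_dataset.get_prepared_dataset(
    batch_size=256,
    shuffle_buffer_size=10_000,
    take=1_000_000,
    assert_length=True,
)
# train_ds can then be passed to model.fit(...) of a compiled Keras model.
```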
def load_dataset(self, file: str) ‑> JIDENNDataset
-
Loads a dataset from a file. The dataset is stored in the tf.data.Dataset format. The element_spec is loaded from the element_spec file inside the dataset directory.
Alternatively, the element_spec can be loaded manually using the load_element_spec method.
Args
file
:str
- The path to the dataset directory.
Returns
JIDENNDataset
- The JIDENNDataset object with the dataset and element_spec set.
Expand source code
def load_dataset(self, file: str) -> JIDENNDataset:
    """Loads a dataset from a file. The dataset is stored in the `tf.data.Dataset` format. The `element_spec`
    is loaded from the `element_spec` file inside the dataset directory.
    Alternatively, the `element_spec` can be loaded manually using the `load_element_spec` method.

    Args:
        file (str): The path to the dataset directory.

    Returns:
        JIDENNDataset: The JIDENNDataset object with the dataset and `element_spec` set.
    """
    if self.element_spec is None:
        element_spec_file = os.path.join(file, 'element_spec')
        jidenn_dataset = self.load_element_spec(element_spec_file)
    else:
        jidenn_dataset = self
    dataset = tf.data.Dataset.load(file, compression='GZIP', element_spec=jidenn_dataset.element_spec)
    return jidenn_dataset._set_dataset(dataset)
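A sketch of loading a previously saved dataset, assuming the variables can be given as a plain list of names (in practice they may come from the config) and using an illustrative path:
```python
# Hypothetical configuration; variable names, target and path are illustrative only.
jidenn_dataset = JIDENNDataset(variables=['jets_pt', 'jets_eta'], target='jets_truth_label', weight=None)
jidenn_dataset = jidenn_dataset.load_dataset('datasets/train')
```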
def load_element_spec(self, file: str) ‑> JIDENNDataset
-
Loads the element_spec from a file. The element_spec is a pickled dictionary of tf.TensorSpec or tf.RaggedTensorSpec objects.
Args
file
:str
- The path to the
element_spec
file.
Returns
JIDENNDataset
- The JIDENNDataset object with the
element_spec
set.
Expand source code
def load_element_spec(self, file: str) -> JIDENNDataset:
    """Loads the `element_spec` from a file. The `element_spec` is a pickled dictionary of `tf.TensorSpec`
    or `tf.RaggedTensorSpec` objects.

    Args:
        file (str): The path to the `element_spec` file.

    Returns:
        JIDENNDataset: The JIDENNDataset object with the `element_spec` set.
    """
    with open(file, 'rb') as f:
        element_spec = pickle.load(f)
    return self._set_element_spec(element_spec)
def plot_data_distributions(self, folder: str, variables: Optional[List[str]] = None, hue_variable: Optional[str] = None, named_labels: Optional[Dict[int, str]] = None, xlabel_mapper: Optional[Dict[str, str]] = None) ‑> None
-
Plots the data distributions of the dataset. The dataset must be loaded before calling this function. The function uses jidenn.evaluation.plotter.plot_data_distributions() to plot the data distributions.
Args
folder
:str
- The path to the directory where the plots are saved.
variables
:Optional[List[str]]
, optional- List of variables to plot. If None, all variables are plotted. Defaults to None.
named_labels
:Dict[int, str]
, optional- Dictionary mapping truth values to custom labels. If not provided, the truth values will be used as labels.
Raises
ValueError
- If the dataset is not loaded yet.
Returns
None
Expand source code
def plot_data_distributions(self, folder: str, variables: Optional[List[str]] = None, hue_variable: Optional[str] = None, named_labels: Optional[Dict[int, str]] = None, xlabel_mapper: Optional[Dict[str, str]] = None) -> None:
    """Plots the data distributions of the dataset. The dataset must be loaded before calling this function.
    The function uses `jidenn.evaluation.plotter.plot_data_distributions` to plot the data distributions.

    Args:
        folder (str): The path to the directory where the plots are saved.
        variables (Optional[List[str]], optional): List of variables to plot. If `None`, all variables are plotted. Defaults to `None`.
        named_labels (Dict[int, str], optional): Dictionary mapping truth values to custom labels. If not provided, the truth values will be used as labels.

    Raises:
        ValueError: If the dataset is not loaded yet.

    Returns:
        None
    """
    if self.dataset is None:
        raise ValueError('Dataset not loaded yet.')
    df = self.to_pandas(variables)
    plot_data_distributions(df, folder=folder, named_labels=named_labels,
                            xlabel_mapper=xlabel_mapper, hue_variable=hue_variable)
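A sketch of a plotting call; the output folder, variable names and label mapping are illustrative only:
```python
# Hypothetical plotting call for a quark/gluon tagging setup.
jidenn_dataset.plot_data_distributions(
    folder='plots/distributions',
    variables=['jets_pt', 'jets_eta'],
    named_labels={0: 'gluon', 1: 'quark'},
)
```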
def remap_labels(self, label_mapping: Callable[[int], int]) ‑> JIDENNDataset
-
Remaps the labels in the dataset using the label_mapping function. Should be used after the create_variables method.
Args
label_mapping
:Callable[[int], int]
- The function that maps the labels.
Raises
ValueError
- If the dataset is not loaded yet.
ValueError
- If the
target
is not set.
Returns
JIDENNDataset
- The JIDENNDataset object where the
label
is remapped.
Expand source code
def remap_labels(self, label_mapping: Callable[[int], int]) -> JIDENNDataset:
    """Remaps the labels in the dataset using the `label_mapping` function. Should be used after the `create_variables` method.

    Args:
        label_mapping (Callable[[int], int]): The function that maps the labels.

    Raises:
        ValueError: If the dataset is not loaded yet.
        ValueError: If the `target` is not set.

    Returns:
        JIDENNDataset: The JIDENNDataset object where the `label` is remapped.
    """
    if self.dataset is None:
        raise ValueError('Dataset not loaded yet.')
    if self.target is None:
        raise ValueError('Target not set yet.')
    if self.weight is not None:
        @tf.function
        def remap_label(x, y, w):
            return x, label_mapping(y), w
    else:
        @tf.function
        def remap_label(x, y):
            return x, label_mapping(y)
    dataset = self.dataset.map(remap_label)
    return self._set_dataset(dataset)
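A sketch of a binary relabelling, assuming the raw truth label uses the PDG convention where 21 denotes a gluon; the mapping is illustrative only:
```python
# Illustrative only: map gluon-labelled jets (truth value 21) to 0, everything else to 1.
jidenn_dataset = jidenn_dataset.remap_labels(lambda label: tf.where(label == 21, 0, 1))
```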
def resample_dataset(self, resampling_func: Callable[[ROOTVariables, Any], int], target_dist: List[float])
-
Resamples the dataset using the resampling_func function. The function computes the bin index for each sample in the dataset. The dataset is then resampled to match the target_dist distribution. Be careful that this may slow down the training process if the target distribution is very different from the original one, as the dataset is resampled on the fly and waits for an appropriate sample to be drawn.
Args
resampling_func
:Callable[[ROOTVariables, Any], int]
- Function that bins the data. It must return an integer between 0 and len(target_dist) - 1.
target_dist
:List[float]
- The target distribution of the resampled dataset.
Raises
ValueError
- If the dataset is not loaded yet.
Returns
JIDENNDataset
- The JIDENNDataset object where the dataset is resampled.
Expand source code
def resample_dataset(self, resampling_func: Callable[[ROOTVariables, Any], int], target_dist: List[float]):
    """Resamples the dataset using the `resampling_func` function. The function computes the bin index for each sample
    in the dataset. The dataset is then resampled to match the `target_dist` distribution. Be careful that this may
    **slow down the training process** if the target distribution is very different from the original one, as the dataset
    is resampled on the fly and waits for an appropriate sample to be drawn.

    Args:
        resampling_func (Callable[[ROOTVariables, Any], int]): Function that bins the data. It must return an integer between 0 and `len(target_dist) - 1`.
        target_dist (List[float]): The target distribution of the resampled dataset.

    Raises:
        ValueError: If the dataset is not loaded yet.

    Returns:
        JIDENNDataset: The JIDENNDataset object where the dataset is resampled.
    """
    if self.dataset is None:
        raise ValueError('Dataset not loaded yet.')

    @tf.function
    def _data_only(x, data):
        return data

    dataset = self.dataset.rejection_resample(resampling_func, target_dist=target_dist).map(_data_only)
    return self._set_dataset(dataset)
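A sketch of balancing a binary label, assuming the dataset already has the (data, label, weight) or (data, label) signature produced by create_variables; the 50/50 target is illustrative:
```python
# Illustrative only: use the integer label itself as the resampling bin.
jidenn_dataset = jidenn_dataset.resample_dataset(
    lambda data, label, weight=None: tf.cast(label, tf.int32),
    target_dist=[0.5, 0.5],
)
```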
def save_dataset(self, file: str, num_shards: Optional[int] = None) ‑> None
-
Saves the dataset to a file. The dataset is stored in the tf.data.Dataset format. The element_spec is stored in the element_spec file inside the dataset directory. TensorFlow saves the element_spec.pb file automatically, but a manual save is required for further processing of the dataset. The TensorFlow file has the .pb extension.
Args
file
:str
- The path to the dataset directory.
num_shards
:int
, optional- The number of shards to split the dataset into. Defaults to None. The sharding is done uniformly into num_shards files.
Raises
ValueError
- If the dataset is not loaded yet.
Returns
None
Expand source code
def save_dataset(self, file: str, num_shards: Optional[int] = None) -> None:
    """Saves the dataset to a file. The dataset is stored in the `tf.data.Dataset` format. The `element_spec`
    is stored in the `element_spec` file inside the dataset directory. TensorFlow saves the `element_spec.pb` file
    automatically, but a manual save is required for further processing of the dataset. The TensorFlow file has the `.pb` extension.

    Args:
        file (str): The path to the dataset directory.
        num_shards (int, optional): The number of shards to split the dataset into. Defaults to `None`.
            The sharding is done uniformly into `num_shards` files.

    Raises:
        ValueError: If the dataset is not loaded yet.

    Returns:
        None
    """
    if self.dataset is None:
        raise ValueError('Dataset not loaded yet.')

    @tf.function
    def random_shards(_) -> tf.Tensor:
        return tf.random.uniform(shape=[], minval=0, maxval=num_shards, dtype=tf.int64)

    self.dataset.save(file, compression='GZIP', shard_func=random_shards if num_shards is not None else None)
    with open(os.path.join(file, 'element_spec'), 'wb') as f:
        pickle.dump(self.dataset.element_spec, f)
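A minimal sketch of saving the current dataset; the path and shard count are illustrative:
```python
# Illustrative only: write the dataset into 4 GZIP-compressed shards.
jidenn_dataset.save_dataset('datasets/train', num_shards=4)
```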
def set_dataset(self, dataset: tf.data.Dataset, element_spec: Dict[str, Union[tf.TensorSpec, tf.RaggedTensorSpec]]) ‑> JIDENNDataset
-
Sets the tf.data.Dataset object and the element_spec of the dataset.
Args
dataset
:tf.data.Dataset
- The tf.data.Dataset object consisting of ROOTVariables.
element_spec
:Dict[str, Union[tf.TensorSpec, tf.RaggedTensorSpec]]
- The
element_spec
of the dataset.
Returns
JIDENNDataset
- The JIDENNDataset object with the dataset and element_spec set.
Expand source code
def set_dataset(self, dataset: tf.data.Dataset, element_spec: Dict[str, Union[tf.TensorSpec, tf.RaggedTensorSpec]]) -> JIDENNDataset:
    """Sets the `tf.data.Dataset` object and the `element_spec` of the dataset.

    Args:
        dataset (tf.data.Dataset): The `tf.data.Dataset` object consisting of `ROOTVariables`.
        element_spec (Dict[str, Union[tf.TensorSpec, tf.RaggedTensorSpec]]): The `element_spec` of the dataset.

    Returns:
        JIDENNDataset: The JIDENNDataset object with the `dataset` and `element_spec` set.
    """
    jidenn_dataset = JIDENNDataset(variables=self.variables, target=self.target, weight=self.weight)
    jidenn_dataset._dataset = dataset
    jidenn_dataset._element_spec = element_spec
    return jidenn_dataset
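A sketch of setting an in-memory dataset manually; the variable names and values are illustrative, and jidenn_dataset is assumed to be an existing JIDENNDataset instance:
```python
# Illustrative only: build a tiny ROOTVariables-style dataset and attach it.
raw = tf.data.Dataset.from_tensor_slices({
    'jets_pt': [[1.0, 2.0], [3.0, 4.0]],
    'eventNumber': [1, 2],
})
jidenn_dataset = jidenn_dataset.set_dataset(raw, element_spec=raw.element_spec)
```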
def to_pandas(self, variables: Optional[List[str]] = None) ‑> pandas.core.frame.DataFrame
-
Converts the dataset to a pandas DataFrame. The dataset must be loaded before calling this function. The function uses tensorflow_datasets.as_dataframe to convert the dataset to a pandas DataFrame, so the tensorflow_datasets package must be installed.
Be careful that this function may take a long time to run, depending on the size of the dataset. Consider taking only a subset of the dataset before converting it to a pandas DataFrame.
jidenn_dataset = JIDENNDataset(...)
...
jidenn_dataset = jidenn_dataset.apply(lambda dataset: dataset.take(1_000))
df = jidenn_dataset.to_pandas()
If the dataset contains nested tuples, consider using explode_nested_variables() on the tuple columns of the converted dataframe.
Args
variables
:Optional[List[str]]
, optional- List of variables to convert to a pandas DataFrame. If None, all variables are converted. Defaults to None.
Raises
ImportError
- If tensorflow_datasets is not installed.
ValueError
- If the dataset is not loaded yet.
Returns
pd.DataFrame
- The tf.data.Dataset converted to a pandas pd.DataFrame.
Expand source code
def to_pandas(self, variables: Optional[List[str]] = None) -> pd.DataFrame:
    """Converts the dataset to a pandas DataFrame. The dataset must be loaded before calling this function.
    The function uses `tensorflow_datasets.as_dataframe` to convert the dataset to a pandas DataFrame, so the
    `tensorflow_datasets` package must be installed.

    Be careful that this function may take a **long time to run**, depending on the size of the dataset.
    Consider taking only a subset of the dataset before converting it to a pandas DataFrame.
    ```python
    jidenn_dataset = JIDENNDataset(...)
    ...
    jidenn_dataset = jidenn_dataset.apply(lambda dataset: dataset.take(1_000))
    df = jidenn_dataset.to_pandas()
    ```

    If the dataset contains nested tuples, consider using `jidenn.data.data_info.explode_nested_variables`
    on the tuple columns of the converted dataframe.

    Args:
        variables (Optional[List[str]], optional): List of variables to convert to a pandas DataFrame. If `None`, all variables are converted. Defaults to `None`.

    Raises:
        ImportError: If `tensorflow_datasets` is not installed.
        ValueError: If the dataset is not loaded yet.

    Returns:
        pd.DataFrame: The `tf.data.Dataset` converted to a pandas `pd.DataFrame`.
    """
    try:
        import tensorflow_datasets as tfds
    except ImportError:
        raise ImportError('Please install tensorflow_datasets to use this function. Use `pip install tensorflow_datasets`.')
    if self.dataset is None:
        raise ValueError('Dataset not loaded yet.')

    if isinstance(self.element_spec, tuple) and variables is None:
        @tf.function
        def tuple_to_dict(data, label, weight=None):
            if isinstance(data, tuple):
                data = {**data[0], **data[1]}
            data = {**data, 'label': label, 'weight': weight}
            return data
    elif isinstance(self.element_spec, tuple) and variables is not None:
        @tf.function
        def tuple_to_dict(data, label, weight=None):
            if isinstance(data, tuple):
                data = {**data[0], **data[1]}
            data = {**data, 'label': label, 'weight': weight}
            return {k: data[k] for k in variables + ['label', 'weight']}
    elif isinstance(self.element_spec, dict) and variables is not None:
        @tf.function
        def tuple_to_dict(data):
            return {k: data[k] for k in variables}
    elif isinstance(self.element_spec, dict) and variables is None:
        @tf.function
        def tuple_to_dict(data):
            return data
    else:
        raise ValueError('The dataset must be a tuple or a dict.')

    dataset = self.dataset.map(tuple_to_dict)
    df = tfds.as_dataframe(dataset)
    df = df.rename(lambda x: x.replace('/', '.'), axis='columns')
    return df