Module `jidenn.preprocess.dataset_ops`

Expand source code

import tensorflow as tf
import os
import argparse
import pickle
from typing import Tuple, List, Dict, Union, Optional

# Type alias for one dataset element: a mapping from a string key to a ragged tensor.
# NOTE(review): the name suggests the keys are ROOT variable/branch names — confirm against the producer.
ROOTVariables = Dict[str, tf.RaggedTensor]


def load_dataset(file_path: str, element_spec_path: Optional[str] = None) -> tf.data.Dataset:
    """Read back a GZIP-compressed dataset written with ``tf.data.Dataset.save()``.

    The element spec is unpickled from ``element_spec_path`` when one is given,
    otherwise from ``<file_path>/element_spec``. A missing file at the default
    location is tolerated (a message is printed and no spec is passed on);
    a missing file at an explicitly given location raises.

    Args:
        file_path (str): Directory holding the saved dataset.
        element_spec_path (str, optional): Location of the pickled element spec.
            Defaults to None, meaning ``file_path/element_spec`` is tried.

    Returns:
        tf.data.Dataset: The loaded dataset.

    Raises:
        FileNotFoundError: If ``element_spec_path`` was given but does not exist.
    """
    spec_given = element_spec_path is not None
    spec_location = element_spec_path if spec_given else os.path.join(file_path, 'element_spec')

    # NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
    try:
        with open(spec_location, 'rb') as spec_file:
            loaded_spec = pickle.load(spec_file)
    except FileNotFoundError:
        if spec_given:
            # An explicitly requested spec file must exist.
            raise
        print("No element spec found, graph mode will not work")
        loaded_spec = None

    return tf.data.Dataset.load(file_path, compression='GZIP', element_spec=loaded_spec)


def split_train_dev_test(dataset: tf.data.Dataset,
                         train_fraction: float,
                         dev_fraction: float,
                         test_fraction: float,
                         seed: Optional[int] = None) -> Tuple[tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]:
    """Split a dataset into disjoint train, dev and test sets.

    The fractions must sum to 1.0 and only 2 decimal places are taken into account.
    Every element is assigned a pseudo-random bucket in ``[0, 100)`` derived from
    ``seed`` and the element's position in ``dataset``; the bucket decides which of
    the three splits the element belongs to. Because all three returned datasets
    use the same seed and the same position-based draw, each element ends up in
    exactly one split, provided ``dataset`` yields its elements in the same order
    every time it is iterated.

    The cardinality of the returned datasets might not be known as the splitting
    is done by filtering.

    Args:
        dataset (tf.data.Dataset): The dataset to split
        train_fraction (float): The fraction of the dataset to use for training dataset.
        dev_fraction (float): The fraction of the dataset to use for development dataset.
        test_fraction (float): The fraction of the dataset to use for testing dataset.
        seed (int, optional): Seed of the bucket assignment. Defaults to None, in which
            case a seed is drawn once per call with Python's ``random`` module.

    Returns:
        Tuple[tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]: The train, dev and test datasets.

    Raises:
        AssertionError: If a fraction is negative or the fractions do not sum to 1.0.
    """
    import random  # local import: only needed to draw a default seed

    # Work in whole percents so that the sum check is exact; summing rounded floats
    # fails for ordinary inputs (0.7 + 0.2 + 0.1 == 0.9999999999999999).
    train_percent = int(round(train_fraction * 100))
    dev_percent = int(round(dev_fraction * 100))
    test_percent = int(round(test_fraction * 100))

    assert min(train_percent, dev_percent, test_percent) >= 0, "Fractions must be non-negative."
    assert train_percent + dev_percent + test_percent == 100, "Fractions must sum to 1.0 and can only have 2 decimal places."

    if seed is None:
        seed = random.randrange(2**31)

    # Buckets [0, dev_start) -> train, [dev_start, test_start) -> dev, [test_start, 100) -> test.
    dev_start = train_percent
    test_start = train_percent + dev_percent

    def assign_bucket(index: tf.Tensor, sample: ROOTVariables) -> Tuple[ROOTVariables, tf.Tensor]:
        # A stateless draw keyed on (seed, index) gives the same bucket for the same
        # element in all three pipelines, which is what keeps the splits disjoint.
        bucket = tf.random.stateless_uniform(shape=[],
                                             seed=tf.stack([tf.constant(seed, dtype=tf.int64), index]),
                                             minval=0,
                                             maxval=100,
                                             dtype=tf.int64)
        return sample, bucket

    def train_filter(sample: ROOTVariables, bucket: tf.Tensor) -> tf.Tensor:
        return tf.less(bucket, dev_start)

    def dev_filter(sample: ROOTVariables, bucket: tf.Tensor) -> tf.Tensor:
        return tf.logical_and(tf.greater_equal(bucket, dev_start), tf.less(bucket, test_start))

    def test_filter(sample: ROOTVariables, bucket: tf.Tensor) -> tf.Tensor:
        return tf.greater_equal(bucket, test_start)

    def drop_bucket(sample: ROOTVariables, bucket: tf.Tensor) -> ROOTVariables:
        return sample

    # enumerate() yields (index, sample); the index is the per-element RNG key.
    tagged = dataset.enumerate().map(assign_bucket)

    return (
        tagged.filter(train_filter).map(drop_bucket),
        tagged.filter(dev_filter).map(drop_bucket),
        tagged.filter(test_filter).map(drop_bucket)
    )


def save_dataset(dataset: tf.data.Dataset, file_path: str, num_shards: int = 256) -> None:
    """Write ``dataset`` to ``file_path`` with GZIP compression and store its element spec.

    Each element is sent to a shard whose index is drawn uniformly at random from
    ``[0, num_shards)``. After the dataset is written, ``dataset.element_spec`` is
    pickled to a file named `element_spec` inside ``file_path``.

    Args:
        dataset (tf.data.Dataset): The dataset to save
        file_path (str): Directory the dataset is written to
        num_shards (int, optional): Upper bound (exclusive) of the random shard index. Defaults to 256.

    """
    spec_location = os.path.join(file_path, 'element_spec')

    @tf.function
    def pick_shard(sample: ROOTVariables) -> tf.Tensor:
        # The element itself is ignored; the shard index is an independent draw.
        shard_index = tf.random.uniform(shape=[], minval=0, maxval=num_shards, dtype=tf.int64)
        return shard_index

    dataset.save(file_path, compression='GZIP', shard_func=pick_shard)

    # The spec is written only after the dataset itself has been saved.
    with open(spec_location, 'wb') as spec_file:
        pickle.dump(dataset.element_spec, spec_file)

Functions

def load_dataset(file_path: str, element_spec_path: Optional[str] = None) ‑> tensorflow.python.data.ops.dataset_ops.DatasetV2

Load a dataset from a file path.

Args

file_path : str: Path to a dataset that was saved with tf.data.Dataset.save()
element_spec_path : str, optional: Path to the element spec pickle file. Defaults to None. If None, the element spec will be loaded from file_path/element_spec.

Returns

tf.data.Dataset: The loaded dataset

Expand source code

def load_dataset(file_path: str, element_spec_path: Optional[str] = None) -> tf.data.Dataset:
    """Read back a GZIP-compressed dataset written with ``tf.data.Dataset.save()``.

    The element spec is unpickled from ``element_spec_path`` when one is given,
    otherwise from ``<file_path>/element_spec``. A missing file at the default
    location is tolerated (a message is printed and no spec is passed on);
    a missing file at an explicitly given location raises.

    Args:
        file_path (str): Directory holding the saved dataset.
        element_spec_path (str, optional): Location of the pickled element spec.
            Defaults to None, meaning ``file_path/element_spec`` is tried.

    Returns:
        tf.data.Dataset: The loaded dataset.

    Raises:
        FileNotFoundError: If ``element_spec_path`` was given but does not exist.
    """
    spec_given = element_spec_path is not None
    spec_location = element_spec_path if spec_given else os.path.join(file_path, 'element_spec')

    # NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
    try:
        with open(spec_location, 'rb') as spec_file:
            loaded_spec = pickle.load(spec_file)
    except FileNotFoundError:
        if spec_given:
            # An explicitly requested spec file must exist.
            raise
        print("No element spec found, graph mode will not work")
        loaded_spec = None

    return tf.data.Dataset.load(file_path, compression='GZIP', element_spec=loaded_spec)

def save_dataset(dataset: tensorflow.python.data.ops.dataset_ops.DatasetV2, file_path: str, num_shards: int = 256) ‑> None

Save a dataset to a file path, with uniform sharding. The dataset will be saved in the GZIP format. The element spec will be saved in a pickle file in the same directory as the dataset with name element_spec.

Args

dataset : tf.data.Dataset: The dataset to save
file_path : str: The path to save the dataset to
num_shards : int, optional: The number of shards to use. Defaults to 256.

Expand source code

def save_dataset(dataset: tf.data.Dataset, file_path: str, num_shards: int = 256) -> None:
    """Write ``dataset`` to ``file_path`` with GZIP compression and store its element spec.

    Each element is sent to a shard whose index is drawn uniformly at random from
    ``[0, num_shards)``. After the dataset is written, ``dataset.element_spec`` is
    pickled to a file named `element_spec` inside ``file_path``.

    Args:
        dataset (tf.data.Dataset): The dataset to save
        file_path (str): Directory the dataset is written to
        num_shards (int, optional): Upper bound (exclusive) of the random shard index. Defaults to 256.

    """
    spec_location = os.path.join(file_path, 'element_spec')

    @tf.function
    def pick_shard(sample: ROOTVariables) -> tf.Tensor:
        # The element itself is ignored; the shard index is an independent draw.
        shard_index = tf.random.uniform(shape=[], minval=0, maxval=num_shards, dtype=tf.int64)
        return shard_index

    dataset.save(file_path, compression='GZIP', shard_func=pick_shard)

    # The spec is written only after the dataset itself has been saved.
    with open(spec_location, 'wb') as spec_file:
        pickle.dump(dataset.element_spec, spec_file)

def split_train_dev_test(dataset: tensorflow.python.data.ops.dataset_ops.DatasetV2, train_fraction: float, dev_fraction: float, test_fraction: float) ‑> Tuple[tensorflow.python.data.ops.dataset_ops.DatasetV2, tensorflow.python.data.ops.dataset_ops.DatasetV2, tensorflow.python.data.ops.dataset_ops.DatasetV2]

Split a dataset into train, dev and test sets. The fractions must sum to 1.0 and only 2 decimal places are taken into account. The cardinality of the returned datasets might not be known as the splitting is done by random filtering.

Args

dataset : tf.data.Dataset: The dataset to split
train_fraction : float: The fraction of the dataset to use for training dataset.
dev_fraction : float: The fraction of the dataset to use for development dataset.
test_fraction : float: The fraction of the dataset to use for testing dataset.

Returns

Tuple[tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]: The train, dev and test datasets.

Expand source code

def split_train_dev_test(dataset: tf.data.Dataset,
                         train_fraction: float,
                         dev_fraction: float,
                         test_fraction: float,
                         seed: Optional[int] = None) -> Tuple[tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]:
    """Split a dataset into disjoint train, dev and test sets.

    The fractions must sum to 1.0 and only 2 decimal places are taken into account.
    Every element is assigned a pseudo-random bucket in ``[0, 100)`` derived from
    ``seed`` and the element's position in ``dataset``; the bucket decides which of
    the three splits the element belongs to. Because all three returned datasets
    use the same seed and the same position-based draw, each element ends up in
    exactly one split, provided ``dataset`` yields its elements in the same order
    every time it is iterated.

    The cardinality of the returned datasets might not be known as the splitting
    is done by filtering.

    Args:
        dataset (tf.data.Dataset): The dataset to split
        train_fraction (float): The fraction of the dataset to use for training dataset.
        dev_fraction (float): The fraction of the dataset to use for development dataset.
        test_fraction (float): The fraction of the dataset to use for testing dataset.
        seed (int, optional): Seed of the bucket assignment. Defaults to None, in which
            case a seed is drawn once per call with Python's ``random`` module.

    Returns:
        Tuple[tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]: The train, dev and test datasets.

    Raises:
        AssertionError: If a fraction is negative or the fractions do not sum to 1.0.
    """
    import random  # local import: only needed to draw a default seed

    # Work in whole percents so that the sum check is exact; summing rounded floats
    # fails for ordinary inputs (0.7 + 0.2 + 0.1 == 0.9999999999999999).
    train_percent = int(round(train_fraction * 100))
    dev_percent = int(round(dev_fraction * 100))
    test_percent = int(round(test_fraction * 100))

    assert min(train_percent, dev_percent, test_percent) >= 0, "Fractions must be non-negative."
    assert train_percent + dev_percent + test_percent == 100, "Fractions must sum to 1.0 and can only have 2 decimal places."

    if seed is None:
        seed = random.randrange(2**31)

    # Buckets [0, dev_start) -> train, [dev_start, test_start) -> dev, [test_start, 100) -> test.
    dev_start = train_percent
    test_start = train_percent + dev_percent

    def assign_bucket(index: tf.Tensor, sample: ROOTVariables) -> Tuple[ROOTVariables, tf.Tensor]:
        # A stateless draw keyed on (seed, index) gives the same bucket for the same
        # element in all three pipelines, which is what keeps the splits disjoint.
        bucket = tf.random.stateless_uniform(shape=[],
                                             seed=tf.stack([tf.constant(seed, dtype=tf.int64), index]),
                                             minval=0,
                                             maxval=100,
                                             dtype=tf.int64)
        return sample, bucket

    def train_filter(sample: ROOTVariables, bucket: tf.Tensor) -> tf.Tensor:
        return tf.less(bucket, dev_start)

    def dev_filter(sample: ROOTVariables, bucket: tf.Tensor) -> tf.Tensor:
        return tf.logical_and(tf.greater_equal(bucket, dev_start), tf.less(bucket, test_start))

    def test_filter(sample: ROOTVariables, bucket: tf.Tensor) -> tf.Tensor:
        return tf.greater_equal(bucket, test_start)

    def drop_bucket(sample: ROOTVariables, bucket: tf.Tensor) -> ROOTVariables:
        return sample

    # enumerate() yields (index, sample); the index is the per-element RNG key.
    tagged = dataset.enumerate().map(assign_bucket)

    return (
        tagged.filter(train_filter).map(drop_bucket),
        tagged.filter(dev_filter).map(drop_bucket),
        tagged.filter(test_filter).map(drop_bucket)
    )