Module jidenn.data.convert_h5

Expand source code
import tensorflow as tf
import uproot
import awkward as ak
import numpy as np
import os
import h5py


def get_jet_h5_iterator(filename, dataset, variable_names):
    """Create a generator factory over the rows of an HDF5 dataset.

    Every row is yielded as a dictionary keyed by ``variable_names`` so that
    the elements match the dictionary ``output_signature`` which
    ``convert_h5_to_tfdataset`` hands to ``tf.data.Dataset.from_generator``.

    Args:
        filename: Path of the HDF5 file; it is opened read-only.
        dataset: Key of the dataset inside the file.
        variable_names: Names paired positionally with the fields of each
            row. Pairing stops at the shorter of the two sequences. When this
            is ``None`` or empty, rows are yielded unchanged.

    Returns:
        A zero-argument callable. Each call returns a new generator that
        opens the file and iterates over ``hf[dataset]``.
    """
    def _h5_iterator():
        # The file is opened only once iteration starts and is closed when
        # the generator finishes or is closed.
        with h5py.File(filename, 'r') as hf:
            rows = hf[dataset]
            if variable_names is None or len(variable_names) == 0:
                # No names to attach: pass the raw rows through.
                yield from rows
                return
            for row in rows:
                # NOTE(review): assumes each row is iterable field by field
                # (e.g. a compound-dtype record) — confirm against the file.
                yield dict(zip(variable_names, row))
    return _h5_iterator


def get_flow_h5_iterator(filename, dataset, variable_names):
    """Create a generator factory over the entries of an HDF5 dataset.

    Each entry is transposed into per-field columns, the last column is used
    as a selection flag, and the remaining columns are filtered by it.

    Args:
        filename: Path of the HDF5 file; it is opened read-only.
        dataset: Key of the dataset inside the file.
        variable_names: Names paired positionally with the columns of each
            entry. Pairing stops at the shorter of the two sequences.

    Returns:
        A zero-argument callable. Each call returns a new generator yielding
        one ``{name: np.ndarray}`` dictionary per entry, containing only the
        elements whose flag in the last column is non-zero.
    """
    def _h5_iterator():
        with h5py.File(filename, 'r') as hf:
            for entry in hf[dataset]:
                # Transpose: one tuple per field instead of one record per element.
                # NOTE(review): assumes every entry is a sequence of
                # equal-length records — confirm against the file layout.
                columns = list(zip(*entry))
                # The last field acts as the selection flag (cast to int32).
                is_good = np.array(columns[-1], dtype=np.int32)
                # Indices of non-zero flags; computed once per entry and
                # reused for every variable instead of once per variable.
                keep = np.where(is_good)
                yield {name: np.array(column)[keep]
                       for name, column in zip(variable_names, columns)}
    return _h5_iterator


def convert_h5_to_tfdataset(load_path,
                            jet_dataset='jets',
                            jet_variables=['jets_E', 'jets_eta', 'jets_pt', 'jets_phi',
                                           'jets_label', 'jets_num', 'event', 'mu', 'corr_mu'],
                            jet_types=[tf.float32, tf.float32, tf.float32, tf.float32,
                                       tf.int32, tf.float32, tf.int32, tf.float32, tf.float32],
                            flow_dataset='flow',
                            flow_variables=["jets_UFO_pt", "jets_UFO_energy", "jets_UFO_deta", "jets_UFO_dphi",
                                            "jets_UFO_dr", "jets_UFO_track_pt", "jets_UFO_d0", "jets_UFO_z0SinTheta"],
                            flow_types=[tf.float32, tf.float32, tf.float32, tf.float32,
                                        tf.float32, tf.float32, tf.float32, tf.float32],
                            ):
    """Build a ``tf.data.Dataset`` from the datasets of an HDF5 file.

    Args:
        load_path: Path of the HDF5 file, forwarded to the iterator factories.
        jet_dataset: Key of the dataset read through ``get_jet_h5_iterator``.
        jet_variables: Names of the per-jet variables.
        jet_types: TensorFlow dtypes paired positionally with ``jet_variables``;
            each variable is declared as a scalar (shape ``()``).
        flow_dataset: Key of the dataset read through ``get_flow_h5_iterator``.
            Pass ``None`` to get the jet dataset only.
        flow_variables: Names of the flow variables.
        flow_types: TensorFlow dtypes paired positionally with
            ``flow_variables``; each variable is declared as a 1-D tensor of
            unknown length (shape ``(None,)``).

    Returns:
        The jet dataset alone when ``flow_dataset`` is ``None``; otherwise the
        jet and flow datasets zipped together, with each pair of element
        dictionaries merged into one (flow keys win on a name clash).
    """
    def _merge(jet_part, flow_part):
        # Combine both element dictionaries into a single flat dictionary.
        return {**jet_part, **flow_part}

    jet_signature = {}
    for var_name, var_type in zip(jet_variables, jet_types):
        jet_signature[var_name] = tf.TensorSpec(shape=(), dtype=var_type)
    jets = tf.data.Dataset.from_generator(
        get_jet_h5_iterator(load_path, jet_dataset, variable_names=jet_variables),
        output_signature=jet_signature,
    )
    if flow_dataset is None:
        return jets

    flow_signature = {}
    for var_name, var_type in zip(flow_variables, flow_types):
        flow_signature[var_name] = tf.TensorSpec(shape=(None, ), dtype=var_type)
    flows = tf.data.Dataset.from_generator(
        get_flow_h5_iterator(load_path, flow_dataset, variable_names=flow_variables),
        output_signature=flow_signature,
    )

    paired = tf.data.Dataset.zip((jets, flows))
    return paired.map(_merge)


if __name__ == '__main__':
    # Manual smoke test: print the first element produced by each iterator.
    h5_path = '/Users/samueljankovych/Documents/jet_tagging/JIDENN/data/group.perf-jets.32603976._000001.output.h5'
    jet_names = ['jets_E', 'jets_eta', 'jets_pt', 'jets_phi',
                 'jets_label', 'jets_num', 'event', 'mu', 'corr_mu']
    flow_names = ["jets_UFO_pt", "jets_UFO_energy", "jets_UFO_deta", "jets_UFO_dphi",
                  "jets_UFO_dr", "jets_UFO_track_pt", "jets_UFO_d0", "jets_UFO_z0SinTheta"]

    jet_iter = get_jet_h5_iterator(h5_path, 'jets', jet_names)
    flow_iter = get_flow_h5_iterator(h5_path, 'flow', flow_names)

    # Calling the factory creates a fresh generator, so `next` returns the
    # first element of the respective dataset.
    print(next(jet_iter()))
    print(next(flow_iter()))

Functions

def convert_h5_to_tfdataset(load_path, jet_dataset='jets', jet_variables=['jets_E', 'jets_eta', 'jets_pt', 'jets_phi', 'jets_label', 'jets_num', 'event', 'mu', 'corr_mu'], jet_types=[tf.float32, tf.float32, tf.float32, tf.float32, tf.int32, tf.float32, tf.int32, tf.float32, tf.float32], flow_dataset='flow', flow_variables=['jets_UFO_pt', 'jets_UFO_energy', 'jets_UFO_deta', 'jets_UFO_dphi', 'jets_UFO_dr', 'jets_UFO_track_pt', 'jets_UFO_d0', 'jets_UFO_z0SinTheta'], flow_types=[tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.float32])
Expand source code
def convert_h5_to_tfdataset(load_path,
                            jet_dataset='jets',
                            jet_variables=['jets_E', 'jets_eta', 'jets_pt', 'jets_phi',
                                           'jets_label', 'jets_num', 'event', 'mu', 'corr_mu'],
                            jet_types=[tf.float32, tf.float32, tf.float32, tf.float32,
                                       tf.int32, tf.float32, tf.int32, tf.float32, tf.float32],
                            flow_dataset='flow',
                            flow_variables=["jets_UFO_pt", "jets_UFO_energy", "jets_UFO_deta", "jets_UFO_dphi",
                                            "jets_UFO_dr", "jets_UFO_track_pt", "jets_UFO_d0", "jets_UFO_z0SinTheta"],
                            flow_types=[tf.float32, tf.float32, tf.float32, tf.float32,
                                        tf.float32, tf.float32, tf.float32, tf.float32],
                            ):
    """Build a ``tf.data.Dataset`` from the datasets of an HDF5 file.

    Args:
        load_path: Path of the HDF5 file, forwarded to the iterator factories.
        jet_dataset: Key of the dataset read through ``get_jet_h5_iterator``.
        jet_variables: Names of the per-jet variables.
        jet_types: TensorFlow dtypes paired positionally with ``jet_variables``;
            each variable is declared as a scalar (shape ``()``).
        flow_dataset: Key of the dataset read through ``get_flow_h5_iterator``.
            Pass ``None`` to get the jet dataset only.
        flow_variables: Names of the flow variables.
        flow_types: TensorFlow dtypes paired positionally with
            ``flow_variables``; each variable is declared as a 1-D tensor of
            unknown length (shape ``(None,)``).

    Returns:
        The jet dataset alone when ``flow_dataset`` is ``None``; otherwise the
        jet and flow datasets zipped together, with each pair of element
        dictionaries merged into one (flow keys win on a name clash).
    """
    def _merge(jet_part, flow_part):
        # Combine both element dictionaries into a single flat dictionary.
        return {**jet_part, **flow_part}

    jet_signature = {}
    for var_name, var_type in zip(jet_variables, jet_types):
        jet_signature[var_name] = tf.TensorSpec(shape=(), dtype=var_type)
    jets = tf.data.Dataset.from_generator(
        get_jet_h5_iterator(load_path, jet_dataset, variable_names=jet_variables),
        output_signature=jet_signature,
    )
    if flow_dataset is None:
        return jets

    flow_signature = {}
    for var_name, var_type in zip(flow_variables, flow_types):
        flow_signature[var_name] = tf.TensorSpec(shape=(None, ), dtype=var_type)
    flows = tf.data.Dataset.from_generator(
        get_flow_h5_iterator(load_path, flow_dataset, variable_names=flow_variables),
        output_signature=flow_signature,
    )

    paired = tf.data.Dataset.zip((jets, flows))
    return paired.map(_merge)
def get_flow_h5_iterator(filename, dataset, variable_names)
Expand source code
def get_flow_h5_iterator(filename, dataset, variable_names):
    """Create a generator factory over the entries of an HDF5 dataset.

    Each entry is transposed into per-field columns, the last column is used
    as a selection flag, and the remaining columns are filtered by it.

    Args:
        filename: Path of the HDF5 file; it is opened read-only.
        dataset: Key of the dataset inside the file.
        variable_names: Names paired positionally with the columns of each
            entry. Pairing stops at the shorter of the two sequences.

    Returns:
        A zero-argument callable. Each call returns a new generator yielding
        one ``{name: np.ndarray}`` dictionary per entry, containing only the
        elements whose flag in the last column is non-zero.
    """
    def _h5_iterator():
        with h5py.File(filename, 'r') as hf:
            for entry in hf[dataset]:
                # Transpose: one tuple per field instead of one record per element.
                # NOTE(review): assumes every entry is a sequence of
                # equal-length records — confirm against the file layout.
                columns = list(zip(*entry))
                # The last field acts as the selection flag (cast to int32).
                is_good = np.array(columns[-1], dtype=np.int32)
                # Indices of non-zero flags; computed once per entry and
                # reused for every variable instead of once per variable.
                keep = np.where(is_good)
                yield {name: np.array(column)[keep]
                       for name, column in zip(variable_names, columns)}
    return _h5_iterator
def get_jet_h5_iterator(filename, dataset, variable_names)
Expand source code
def get_jet_h5_iterator(filename, dataset, variable_names):
    """Create a generator factory over the rows of an HDF5 dataset.

    Every row is yielded as a dictionary keyed by ``variable_names`` so that
    the elements match the dictionary ``output_signature`` which
    ``convert_h5_to_tfdataset`` hands to ``tf.data.Dataset.from_generator``.

    Args:
        filename: Path of the HDF5 file; it is opened read-only.
        dataset: Key of the dataset inside the file.
        variable_names: Names paired positionally with the fields of each
            row. Pairing stops at the shorter of the two sequences. When this
            is ``None`` or empty, rows are yielded unchanged.

    Returns:
        A zero-argument callable. Each call returns a new generator that
        opens the file and iterates over ``hf[dataset]``.
    """
    def _h5_iterator():
        # The file is opened only once iteration starts and is closed when
        # the generator finishes or is closed.
        with h5py.File(filename, 'r') as hf:
            rows = hf[dataset]
            if variable_names is None or len(variable_names) == 0:
                # No names to attach: pass the raw rows through.
                yield from rows
                return
            for row in rows:
                # NOTE(review): assumes each row is iterable field by field
                # (e.g. a compound-dtype record) — confirm against the file.
                yield dict(zip(variable_names, row))
    return _h5_iterator