Module jidenn.models.ParT

Implementation of Particle Transformer model based on the paper https://arxiv.org/abs/2202.03772.

This model is a transformer based on the CaiT model (https://arxiv.org/abs/2103.17239), used in image classification tasks. On top of the jidenn.models.Transformer model, it adds ClassAttention layers, which allow more effective feature extraction. It also includes more layer normalization layers.

The main difference from CaiT, and the novelty of this model, is the use of interaction variables. These are variables computed for each pair of input particles, which are added when calculating the attention weights as $$\mathrm{Attention}(Q, K) = \mathrm{softmax} \left( \frac{QK^T}{\sqrt{d_k}} + U \right)$$ where $U$ is the interaction tensor of shape `(batch, num_particles, num_particles, heads)`, i.e. each head gets its own interaction matrix.
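A minimal sketch of this biased attention (illustrative shapes only, not part of the module; the names B, N, H and d_k are placeholders):

import tensorflow as tf

B, N, H, d_k = 2, 10, 4, 16                              # batch, particles, heads, key dimension
q = tf.random.normal((B, H, N, d_k))
k = tf.random.normal((B, H, N, d_k))
U = tf.random.normal((B, N, N, H))                       # pairwise interaction features, one slice per head

logits = tf.matmul(q, k, transpose_b=True) / d_k ** 0.5  # (B, H, N, N) scaled dot-product
logits += tf.transpose(U, [0, 3, 1, 2])                  # add the interaction bias, heads moved to axis 1
attention = tf.nn.softmax(logits, axis=-1)               # attention weights biased by U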

(Figures: ParT architecture overview and ParT layer diagrams — images/part.png, images/part_layers_1.png, images/part_layers_2.png.)
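A hypothetical end-to-end construction of the full model (the hyper-parameter values below are illustrative, not prescribed by this module; the two input shapes correspond to per-particle features and pairwise interaction features):

import tensorflow as tf

model = ParTModel(
    input_shape=((None, 8), (None, None, 4)),   # per-particle features, pairwise interaction features
    embed_dim=128,
    embed_layers=3,
    self_attn_layers=8,
    class_attn_layers=2,
    expansion=4,
    heads=8,
    output_layer=tf.keras.layers.Dense(2, activation=tf.nn.softmax),
    activation=tf.nn.gelu,
    dropout=0.1,
    interaction_embed_layers=3,
    interaction_embed_layer_size=64,
)

Without the interaction variables, `input_shape` would be a single shape such as `(None, 8)` and the interaction embedding arguments can be omitted.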

Expand source code
r"""
Implementation of Particle Transformer model based on the paper https://arxiv.org/abs/2202.03772.

This model is a transformer based on the CaiT model (https://arxiv.org/abs/2103.17239), used in image classification tasks.
On top of the `jidenn.models.Transformer` model, it adds ClassAttention layers, which allow more effective feature extraction.
It also includes more layer normalization layers.

The main difference from CaiT, and the novelty of this model, is the use of **interaction variables**.
These are variables computed for each pair of input particles, which are then used when calculating the attention weights as 
$$ \mathrm{Attention}(Q, K) = \mathrm{softmax} \left( \frac{QK^T}{\sqrt{d_k}} + U \right)$$
where $U$ is the interaction matrix of shape `(batch, num_particles, num_particles, heads)`, where each `head` gets its own interaction matrix.

![ParT](images/part.png)
![ParT](images/part_layers_1.png)
![ParT](images/part_layers_2.png)

"""
import tensorflow as tf
from typing import Callable, Union, Tuple, Optional


class FFN(tf.keras.layers.Layer):
    """Feed-forward network
    On top of the Transformer FFN layer, it adds a layer normalization in between the two dense layers.

    Args:
        dim (int): dimension of the input and output
        expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
        dropout (float, optional): dropout rate. Defaults to None.
    """

    def __init__(self, dim: int, expansion: int, activation: Callable[[tf.Tensor], tf.Tensor], dropout: Optional[float] = None):
        super().__init__()
        self.dim, self.expansion, self.activation, self.dropout = dim, expansion, activation, dropout

        self.wide_dense = tf.keras.layers.Dense(dim * expansion, activation=activation)
        self.dense = tf.keras.layers.Dense(dim, activation=None)
        self.ln = tf.keras.layers.LayerNormalization()
        self.layer_dropout = tf.keras.layers.Dropout(dropout)

    def get_config(self):
        config = super(FFN, self).get_config()
        config.update({"dim": self.dim, "expansion": self.expansion,
                      "activation": self.activation, "dropout": self.dropout})
        return config

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Forward pass of the feed-forward network
        Includes a layer normalization layer in between the two dense layers

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
        """
        output = self.wide_dense(inputs)
        output = self.ln(output)
        output = self.dense(output)
        output = self.layer_dropout(output)
        return output


class MultiheadSelfAttention(tf.keras.layers.Layer):
    """Multi-head self-attention layer
    Standalone implementation of the multi-head self-attention layer, which
    includes the interaction variables.

    Args:
        dim (int): dimension of the input and output
        heads (int): number of heads

    """

    def __init__(self, dim: int, heads: int, ):
        super().__init__()
        self.dim, self.heads = dim, heads

        self.linear_qkv = tf.keras.layers.Dense(dim * 3)
        self.linear_out = tf.keras.layers.Dense(dim)

    def get_config(self):
        config = super(MultiheadSelfAttention, self).get_config()
        config.update({"dim": self.dim, "heads": self.heads})
        return config

    def call(self, inputs: tf.Tensor, mask: tf.Tensor, interaction: Optional[tf.Tensor] = None) -> tf.Tensor:
        """Forward pass of the multi-head self-attention layer

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
            mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles, num_particles)`
                This mask is used to mask out the attention of padding particles, generated when
                tf.RaggedTensor is converted to tf.Tensor.
            interaction (tf.Tensor, optional): interaction tensor of shape `(batch_size, num_particles, num_particles, heads)`

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
        """
        B, N, C = tf.shape(inputs)[0], tf.shape(inputs)[1], tf.shape(inputs)[2]

        qkv = self.linear_qkv(inputs)  # (B, N, C * 3)
        qkv = tf.reshape(qkv, [B, N, 3, self.heads, C // self.heads])  # (B, N, 3, H, C // H)
        qkv = tf.transpose(qkv, [2, 0, 3, 1, 4])  # (3, B, H, N, C // H)
        q, k, v = qkv[0], qkv[1], qkv[2]  # 3 x (B, H, N, C // H)

        attention_weights = tf.linalg.matmul(q, k, transpose_b=True) / (q.shape[-1] ** 0.5)  # (B, H, N, N)

        if interaction is not None:
            interaction = tf.transpose(interaction, [0, 3, 1, 2])  # (B, H, N, N)
            attention_weights += interaction

        attention = tf.keras.layers.Softmax()(attention_weights, mask=mask)  # (B, H, N, N)

        output = tf.linalg.matmul(attention, v)  # (B, H, N, C // H)
        output = tf.transpose(output, [0, 2, 1, 3])  # (B, N, H, C // H)
        output = tf.reshape(output, [B, N, C])  # (B, N, C)
        output = self.linear_out(output)  # (B, N, C)
        return output


class MultiheadClassAttention(tf.keras.layers.Layer):
    """Multi-head class attention layer
    This layer is a wrapper around the `tf.keras.layers.MultiHeadAttention` layer,
    fixing the key and value to be the same as the input and using only the class token
    as the query.

    Args:
        dim (int): dimension of the input and output
        heads (int): number of heads
        dropout (float, optional): dropout rate, defaults to None
    """

    def __init__(self, dim: int, heads: int, dropout: Optional[float] = None):
        super().__init__()
        self.dim, self.heads, self.dropout = dim, heads, dropout
        self.mha = tf.keras.layers.MultiHeadAttention(key_dim=dim // heads, num_heads=heads)
        self.layer_dropout = tf.keras.layers.Dropout(dropout)

    def get_config(self):
        config = super(MultiheadClassAttention, self).get_config()
        config.update({"dim": self.dim, "heads": self.heads, "dropout": self.dropout})
        return config

    def call(self, inputs: tf.Tensor, class_token: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
        """Forward pass of the multi-head self-attention layer

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
            class_token (tf.Tensor): class token tensor of shape `(batch_size, 1, dim)`
            mask (tf.Tensor): mask tensor of shape `(batch_size, 1, num_particles)`
                This mask is used to mask out the attention of padding particles, generated when
                tf.RaggedTensor is converted to tf.Tensor.
        Returns:
            tf.Tensor: output tensor of shape `(batch_size, 1, dim)`
        """
        output = self.mha(query=class_token, value=inputs, key=inputs, attention_mask=mask)
        output = self.layer_dropout(output)
        return output


class SelfAttentionBlock(tf.keras.layers.Layer):
    """Self-attention block.
    It contains a multi-head self-attention layer and a feed-forward network with residual connections
    and layer normalizations. The self-attention layer includes the interaction variables.

    Args:
        dim (int): dimension of the input and output
        heads (int): number of heads
        expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
        dropout (float, optional): dropout rate. Defaults to None.
    """

    def __init__(self, dim: int, heads: int, expansion: int, activation: Callable[[tf.Tensor], tf.Tensor], dropout: Optional[float] = None):
        super().__init__()
        self.dim, self.heads, self.dropout, self.expansion, self.activation = dim, heads, dropout, expansion, activation

        self.pre_mhsa_ln = tf.keras.layers.LayerNormalization()
        self.mhsa = MultiheadSelfAttention(dim=dim, heads=heads)
        self.post_mhsa_ln = tf.keras.layers.LayerNormalization()
        self.mhsa_dropout = tf.keras.layers.Dropout(dropout)

        self.pre_ffn_ln = tf.keras.layers.LayerNormalization()
        self.ffn = FFN(dim=dim, expansion=expansion, activation=activation, dropout=dropout)

    def get_config(self):
        config = super().get_config()
        config.update({"dim": self.dim, "heads": self.heads, "dropout": self.dropout,
                      "expansion": self.expansion, "activation": self.activation})
        return config

    def call(self, inputs: tf.Tensor, mask: tf.Tensor, interaction: Optional[tf.Tensor] = None) -> tf.Tensor:
        """Forward pass of the self-attention block

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
            mask (tf.Tensor, optional): mask tensor of shape `(batch_size, num_particles, num_particles)`. Defaults to None.
                This mask is used to mask out the attention of padding particles, generated when
                tf.RaggedTensor is converted to tf.Tensor.
            interaction (tf.Tensor, optional): interaction tensor of shape `(batch_size, num_particles, num_particles, heads)`. Defaults to None.

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
        """
        attented = self.pre_mhsa_ln(inputs)
        attented = self.mhsa(inputs=attented, mask=mask, interaction=interaction)
        attented = self.post_mhsa_ln(attented)
        attented = self.mhsa_dropout(attented)
        attented = attented + inputs

        ffned = self.pre_ffn_ln(attented)
        ffned = self.ffn(ffned)
        output = ffned + attented

        return output


class ClassAttentionBlock(tf.keras.layers.Layer):
    """Class attention block.
    It allows the class token to attend to the input particles, and then feeds the attended class token
    to the feed-forward network with residual connections and layer normalizations.

    This extracts the class information from the attended particles more effectively.

    Args:
        dim (int): dimension of the input and output
        heads (int): number of heads
        dropout (float, optional): dropout rate. Defaults to None.
        expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
    """

    def __init__(self, dim: int, heads: int, expansion: int, dropout: Optional[float] = None):
        super().__init__()
        self.dim, self.heads, self.dropout, self.expansion = dim, heads, dropout, expansion

        self.pre_mhca_ln = tf.keras.layers.LayerNormalization()
        self.mhca = MultiheadClassAttention(dim=dim, heads=heads, dropout=dropout)
        self.post_mhca_ln = tf.keras.layers.LayerNormalization()
        self.mhca_dropout = tf.keras.layers.Dropout(dropout)

        self.pre_ffn_ln = tf.keras.layers.LayerNormalization()
        self.ffn = FFN(dim=dim, expansion=expansion, activation=tf.nn.gelu, dropout=dropout)

    def get_config(self):
        config = super().get_config()
        config.update({"dim": self.dim, "heads": self.heads, "dropout": self.dropout})
        return config

    def call(self, inputs: tf.Tensor, class_token: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
        """Forward pass of the class attention block

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
            class_token (tf.Tensor): class token tensor of shape `(batch_size, 1, dim)`
                It is concatenated with the input tensor along the particle dimension,
                at the front of the input tensor.
            mask (tf.Tensor): mask tensor of shape `(batch_size, 1, num_particles)`
                This mask is used to mask out the attention of padding particles, generated when
                tf.RaggedTensor is converted to tf.Tensor.

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, 1, dim)`, an updated class token
        """
        attented = tf.concat([class_token, inputs], axis=1)
        attented = self.pre_mhca_ln(attented)
        attented = self.mhca(inputs=attented, class_token=class_token, mask=mask)
        attented = self.post_mhca_ln(attented)
        attented = self.mhca_dropout(attented)
        attented = attented + class_token

        ffned = self.pre_ffn_ln(attented)
        ffned = self.ffn(ffned)
        output = ffned + attented
        return output


class ParT(tf.keras.layers.Layer):
    """Pure Particle Transformer (ParT) layers without the embedding and output layers.

    It also creates the class token, which is used to encode the global information of the input,
    using the ClassAttentionBlock.

    Args:
        dim (int): dimension of the input and output
        self_attn_layers (int): number of self-attention layers
        class_attn_layers (int): number of class-attention layers
        expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
        heads (int): number of heads
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
        dropout (float, optional): dropout rate. Defaults to None.
    """

    def __init__(self,
                 dim: int,
                 self_attn_layers: int,
                 class_attn_layers: int,
                 expansion: int,
                 heads: int,
                 activation: Callable[[tf.Tensor], tf.Tensor],
                 dropout: Optional[float] = None):
        # Make sure `dim` is even.
        assert dim % 2 == 0

        super().__init__()
        self.dim, self.expansion, self.heads, self.dropout, self.activation, self.num_selfattn_layers, self.num_class_layers = dim, expansion, heads, dropout, activation, self_attn_layers, class_attn_layers

        self.class_token = tf.Variable(tf.random.truncated_normal((1, 1, dim), stddev=0.02), trainable=True)
        self.sa_layers = [SelfAttentionBlock(dim, heads, expansion, activation, dropout)
                          for _ in range(self_attn_layers)]
        self.ca_layers = [ClassAttentionBlock(dim, heads, expansion, dropout) for _ in range(class_attn_layers)]

    def get_config(self):
        config = super(ParT, self).get_config()
        config.update({name: getattr(self, name)
                      for name in ["dim", "expansion", "heads", "dropout", "activation", "num_selfattn_layers", "num_class_layers"]})
        return config

    def call(self, inputs: tf.Tensor, mask: tf.Tensor, interaction: Optional[tf.Tensor] = None) -> tf.Tensor:
        """Forward pass of the ParT layers

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
            mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles)`.
                From the mask, a mask tensor of shape `(batch_size, num_particles, num_particles)`
                is calculated, which is used to mask out the attention of padding particles, generated when
                `tf.RaggedTensor` is converted to `tf.Tensor`.
            interaction (tf.Tensor, optional): interaction tensor of shape `(batch_size, num_particles, num_particles, heads)`

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, 1, dim)`, the extracted class token
        """
        sa_mask = mask[:, tf.newaxis, tf.newaxis, :] & mask[:, tf.newaxis, :, tf.newaxis]
        hidden = inputs
        for layer in self.sa_layers:
            hidden = layer(hidden, sa_mask, interaction)

        class_token = tf.tile(self.class_token, [tf.shape(inputs)[0], 1, 1])
        class_mask = mask[:, tf.newaxis, :]
        class_mask = tf.concat([tf.ones((tf.shape(inputs)[0], 1, 1), dtype=tf.bool), class_mask], axis=2)
        for layer in self.ca_layers:
            class_token = layer(hidden, class_token, class_mask)
        return class_token


class FCEmbedding(tf.keras.layers.Layer):
    """Embedding layer as a series of fully-connected layers.

    Args:
        embedding_dim (int): dimension of the embedding
        num_embeding_layers (int): number of fully-connected layers
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
    """

    def __init__(self, embedding_dim: int, num_embeding_layers: int, activation: Callable[[tf.Tensor], tf.Tensor], ):

        super().__init__()
        self.embedding_dim, self.activation, self.num_embeding_layers = embedding_dim, activation, num_embeding_layers
        self.layers = [tf.keras.layers.Dense(self.embedding_dim, activation=self.activation)
                       for _ in range(self.num_embeding_layers)]

    def get_config(self):
        config = super(FCEmbedding, self).get_config()
        config.update({name: getattr(self, name) for name in ["embedding_dim", "num_embeding_layers", "activation"]})
        return config

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Forward pass of the embedding layer

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, num_features)`

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, num_particles, embed_dim)`
        """
        hidden = inputs
        for layer in self.layers:
            hidden = layer(hidden)
        return hidden


class CNNEmbedding(tf.keras.layers.Layer):
    """Embedding layer of the interaction variables as a series of point-wise convolutional layers.
    The interaction variables are computed for each pair of particles.
    This creates redundancy in the input, as the matrix is symmetric and its diagonal is always zero.
    To save computation, only the upper-triangular part of the matrix is used as input; it is
    flattened before the 1D convolutions are applied to it.

    Args:
        num_layers (int): number of convolutional layers
        layer_size (int): number of channels of the hidden layers
        out_dim (int): number of channels of the last convolutional layer which 
            is manually appended as an extra layer after `num_layers` layers.
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
    """

    def __init__(self, num_layers: int, layer_size: int, out_dim: int, activation: Callable[[tf.Tensor], tf.Tensor]):
        super().__init__()
        self.activation, self.num_layers, self.layer_size, self.out_dim = activation, num_layers, layer_size, out_dim

        self.conv_layers = [tf.keras.layers.Conv1D(layer_size, 1) for _ in range(num_layers)]
        self.conv_layers.append(tf.keras.layers.Conv1D(out_dim, 1))
        self.bn = [tf.keras.layers.BatchNormalization() for _ in range(num_layers + 1)]
        self.activation = tf.keras.layers.Activation(activation)

    def get_config(self):
        config = super(CNNEmbedding, self).get_config()
        config.update({name: getattr(self, name)
                      for name in ["num_layers", "layer_size", "out_dim", "activation"]})
        return config

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Forward pass of the interaction embedding layer

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, num_particles, num_features)`
                This matrix is assumed to be symmetric with a zero diagonal, so only the upper-triangular
                part is used for the embedding.

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, num_particles, num_particles, out_dim)`
        """

        ones = tf.ones_like(inputs[0, :, :, 0])
        upper_tril_mask = tf.linalg.band_part(ones, 0, -1)
        diag_mask = tf.linalg.band_part(ones, 0, 0)
        upper_tril_mask = tf.cast(upper_tril_mask - diag_mask, tf.bool)
        flattened_upper_triag = tf.boolean_mask(inputs, upper_tril_mask, axis=1)

        hidden = flattened_upper_triag
        for conv, norm in zip(self.conv_layers, self.bn):
            hidden = conv(hidden)
            hidden = norm(hidden)
            hidden = self.activation(hidden)

        true_mask = tf.cast(tf.where(upper_tril_mask), tf.int32)
        out = tf.transpose(hidden, [1, 0, 2])
        out = tf.scatter_nd(true_mask, out, shape=[tf.shape(inputs)[1],
                            tf.shape(inputs)[2], tf.shape(inputs)[0], self.out_dim])
        out = out + tf.transpose(out, [1, 0, 2, 3])
        out = tf.transpose(out, [2, 0, 1, 3])
        return out


class ParTModel(tf.keras.Model):
    """ParT model with embwith embedding and output layers.

    The model already contains the `tf.keras.layers.Input` layer, so it can be used as a standalone model.

    The input tensor can be either a tensor of shape `(batch_size, num_particles, num_features)` or
    a tuple of tensors `(particle_tensor, interaction_tensor)` of shapes 
    `(batch_size, num_particles, num_features)` and `(batch_size, num_particles, num_particles, num_features)`, respectively.

    The model can be used with or without the interaction tensor, depending on the type of `input_shape`:
    if it is a tuple of two shapes, the interaction tensor is assumed to be present.

    The input tensor is first passed through the embedding layer, then the ParT layers, and finally the output layer.
    If the interaction tensor is present, it is passed through the interaction embedding layer before the ParT layers.

    If the preprocessing layer is not None, the input tensor is first passed through the preprocessing layer before the embedding layer.
    If the interaction tensor is present, the preprocessing layer is expected to be a tuple of two layers,
    which are applied to the particle and interaction tensors, respectively.

    The output of ParT is a vector of shape `(batch_size, embed_dim)` with extracted class information.
    This is then passed through the output layer.
    Layer normalization is applied to the output of the ParT layers before the output layer.

    Args:
        input_shape (Union[Tuple[None, int], Tuple[Tuple[None, int], Tuple[None, None, int]]]): shape of the input tensor.
            If the interaction tensor is present, it is assumed to be a tuple of two shapes,
            each creating a separate input layer.
        embed_dim (int): dimension of the embedding layer
        embed_layers (int): number of layers of the embedding layer
        self_attn_layers (int): number of self-attention layers
        class_attn_layers (int): number of class-attention layers
        expansion (int): expansion factor of the self-attention layers
        heads (int): number of heads of the self-attention layers
        output_layer (tf.keras.layers.Layer): output layer
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
        dropout (Optional[float], optional): dropout rate. Defaults to None.
        interaction_embed_layers (Optional[int], optional): number of layers of the interaction embedding layer. Defaults to None.
        interaction_embed_layer_size (Optional[int], optional): size of the layers of the interaction embedding layer. Defaults to None.
        preprocess (Union[tf.keras.layers.Layer, None, Tuple[tf.keras.layers.Layer, tf.keras.layers.Layer]], optional): preprocessing layer. Defaults to None.

    """

    def __init__(self,
                 input_shape: Union[Tuple[None, int], Tuple[Tuple[None, int], Tuple[None, None, int]]],
                 embed_dim: int,
                 embed_layers: int,
                 self_attn_layers: int,
                 class_attn_layers: int,
                 expansion: int,
                 heads: int,
                 output_layer: tf.keras.layers.Layer,
                 activation: Callable[[tf.Tensor], tf.Tensor],
                 dropout: Optional[float] = None,
                 interaction_embed_layers: Optional[int] = None,
                 interaction_embed_layer_size: Optional[int] = None,
                 preprocess: Union[tf.keras.layers.Layer, None, Tuple[tf.keras.layers.Layer, tf.keras.layers.Layer]] = None):

        if isinstance(input_shape, tuple) and isinstance(input_shape[0], tuple):
            input = (tf.keras.layers.Input(shape=input_shape[0], ragged=True),
                     tf.keras.layers.Input(shape=input_shape[1], ragged=True))
            row_lengths = input[0].row_lengths()
            hidden = input[0].to_tensor()
            interaction_hidden = input[1].to_tensor()

            if preprocess is not None:
                if not isinstance(preprocess, tuple):
                    raise ValueError(
                        "preprocess must be a tuple of two layers when the input is a tuple of two tensors.")

                preprocess, interaction_preprocess = preprocess
                if interaction_preprocess is not None:
                    interaction_hidden = interaction_preprocess(interaction_hidden)

            if interaction_embed_layers is None or interaction_embed_layer_size is None:
                raise ValueError(
                    """interaction_embed_layers and interaction_embed_layer_size must be specified 
                    when the input is a tuple of two tensors, i.e. the interaction variables are used.""")

            embed_interaction = CNNEmbedding(
                interaction_embed_layers,
                interaction_embed_layer_size,
                heads,
                activation)(interaction_hidden)
        else:
            input = tf.keras.layers.Input(shape=input_shape, ragged=True)
            embed_interaction = None
            row_lengths = input.row_lengths()
            hidden = input.to_tensor()

        if preprocess is not None:
            if isinstance(preprocess, tuple):
                raise ValueError("preprocess must be a single layer when the input is a single tensor.")
            hidden = preprocess(hidden)

        hidden = FCEmbedding(embed_dim, embed_layers, activation)(hidden)

        transformed = ParT(dim=embed_dim,
                           self_attn_layers=self_attn_layers,
                           class_attn_layers=class_attn_layers,
                           expansion=expansion,
                           heads=heads,
                           dropout=dropout,
                           activation=activation)(hidden, tf.sequence_mask(row_lengths), embed_interaction)

        transformed = tf.keras.layers.LayerNormalization()(transformed)
        output = output_layer(transformed[:, 0, :])

        super().__init__(inputs=input, outputs=output)

Classes

class CNNEmbedding (num_layers: int, layer_size: int, out_dim: int, activation: Callable[[tensorflow.python.framework.ops.Tensor], tensorflow.python.framework.ops.Tensor])

Embedding layer of the interaction variables as a series of point-wise convolutional layers. The interaction variables are computed for each pair of particles. This creates redundancy in the input, as the matrix is symmetric and its diagonal is always zero. To save computation, only the upper-triangular part of the matrix is used as input; it is flattened and the 1D convolutions are applied to it.

Args

num_layers : int
number of convolutional layers
layer_size : int
number of channels of the hidden layers
out_dim : int
number of channels of the last convolutional layer which is manually appended as an extra layer after num_layers layers.

activation : Callable[[tf.Tensor], tf.Tensor]
activation function
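A small illustration of the upper-triangular trick (hypothetical values; only the strictly upper-triangular pairs are kept, since the interaction matrix is symmetric with a zero diagonal):

import tensorflow as tf

x = tf.random.normal((2, 5, 5, 3))                       # (batch, N, N, features)
ones = tf.ones_like(x[0, :, :, 0])
upper = tf.linalg.band_part(ones, 0, -1) - tf.linalg.band_part(ones, 0, 0)
mask = tf.cast(upper, tf.bool)                           # strictly upper-triangular entries
pairs = tf.boolean_mask(x, mask, axis=1)                 # (batch, N*(N-1)/2, features) = (2, 10, 3)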

Expand source code
class CNNEmbedding(tf.keras.layers.Layer):
    """Embedding layer of the interaction variables as a series of point-wise convolutional layers.
    The interaction variables are computed for each pair of particles.
    This creates redundancy in the input, as the matrix is symmetric and its diagonal is always zero.
    To save computation, only the upper-triangular part of the matrix is used as input; it is
    flattened before the 1D convolutions are applied to it.

    Args:
        num_layers (int): number of convolutional layers
        layer_size (int): number of channels of the hidden layers
        out_dim (int): number of channels of the last convolutional layer which 
            is manually appended as an extra layer after `num_layers` layers.
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
    """

    def __init__(self, num_layers: int, layer_size: int, out_dim: int, activation: Callable[[tf.Tensor], tf.Tensor]):
        super().__init__()
        self.activation, self.num_layers, self.layer_size, self.out_dim = activation, num_layers, layer_size, out_dim

        self.conv_layers = [tf.keras.layers.Conv1D(layer_size, 1) for _ in range(num_layers)]
        self.conv_layers.append(tf.keras.layers.Conv1D(out_dim, 1))
        self.bn = [tf.keras.layers.BatchNormalization() for _ in range(num_layers + 1)]
        self.activation = tf.keras.layers.Activation(activation)

    def get_config(self):
        config = super(CNNEmbedding, self).get_config()
        config.update({name: getattr(self, name)
                      for name in ["num_layers", "layer_size", "out_dim", "activation"]})
        return config

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Forward pass of the interaction embedding layer

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, num_particles, num_features)`
                This matrix is assumed to be symmetric with a zero diagonal, so only the upper-triangular
                part is used for the embedding.

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, num_particles, num_particles, out_dim)`
        """

        ones = tf.ones_like(inputs[0, :, :, 0])
        upper_tril_mask = tf.linalg.band_part(ones, 0, -1)
        diag_mask = tf.linalg.band_part(ones, 0, 0)
        upper_tril_mask = tf.cast(upper_tril_mask - diag_mask, tf.bool)
        flattened_upper_triag = tf.boolean_mask(inputs, upper_tril_mask, axis=1)

        hidden = flattened_upper_triag
        for conv, norm in zip(self.conv_layers, self.bn):
            hidden = conv(hidden)
            hidden = norm(hidden)
            hidden = self.activation(hidden)

        true_mask = tf.cast(tf.where(upper_tril_mask), tf.int32)
        out = tf.transpose(hidden, [1, 0, 2])
        out = tf.scatter_nd(true_mask, out, shape=[tf.shape(inputs)[1],
                            tf.shape(inputs)[2], tf.shape(inputs)[0], self.out_dim])
        out = out + tf.transpose(out, [1, 0, 2, 3])
        out = tf.transpose(out, [2, 0, 1, 3])
        return out

Ancestors

  • keras.engine.base_layer.Layer
  • tensorflow.python.module.module.Module
  • tensorflow.python.trackable.autotrackable.AutoTrackable
  • tensorflow.python.trackable.base.Trackable
  • keras.utils.version_utils.LayerVersionSelector

Methods

def call(self, inputs: tensorflow.python.framework.ops.Tensor) ‑> tensorflow.python.framework.ops.Tensor

Forward pass of the interaction embedding layer

Args

inputs : tf.Tensor
input tensor of shape (batch_size, num_particles, num_particles, num_features) This matrix is assumed to be symmetric with a zero diagonal, so only the upper-triangular part is used for the embedding.

Returns

tf.Tensor
output tensor of shape (batch_size, num_particles, num_particles, out_dim)
Expand source code
def call(self, inputs: tf.Tensor) -> tf.Tensor:
    """Forward pass of the interaction embedding layer

    Args:
        inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, num_particles, num_features)`
            This matrix is assumed to be symmetric with a zero diagonal, so only the upper-triangular
            part is used for the embedding.

    Returns:
        tf.Tensor: output tensor of shape `(batch_size, num_particles, num_particles, out_dim)`
    """

    ones = tf.ones_like(inputs[0, :, :, 0])
    upper_tril_mask = tf.linalg.band_part(ones, 0, -1)
    diag_mask = tf.linalg.band_part(ones, 0, 0)
    upper_tril_mask = tf.cast(upper_tril_mask - diag_mask, tf.bool)
    flattened_upper_triag = tf.boolean_mask(inputs, upper_tril_mask, axis=1)

    hidden = flattened_upper_triag
    for conv, norm in zip(self.conv_layers, self.bn):
        hidden = conv(hidden)
        hidden = norm(hidden)
        hidden = self.activation(hidden)

    true_mask = tf.cast(tf.where(upper_tril_mask), tf.int32)
    out = tf.transpose(hidden, [1, 0, 2])
    out = tf.scatter_nd(true_mask, out, shape=[tf.shape(inputs)[1],
                        tf.shape(inputs)[2], tf.shape(inputs)[0], self.out_dim])
    out = out + tf.transpose(out, [1, 0, 2, 3])
    out = tf.transpose(out, [2, 0, 1, 3])
    return out
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by Network (one layer of abstraction above).

Note that get_config() does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns

Python dictionary.

Expand source code
def get_config(self):
    config = super(CNNEmbedding, self).get_config()
    config.update({name: getattr(self, name)
                  for name in ["num_layers", "layer_size", "out_dim", "activation"]})
    return config
class ClassAttentionBlock (dim: int, heads: int, expansion: int, dropout: Optional[float] = None)

Class attention block. It allows the class token to attend to the input particles, and then feeds the attended class token to the feed-forward network with residual connections and layer normalizations.

This extracts the class information from the attended particles more effectively.

Args

dim : int
dimension of the input and output
heads : int
number of heads
dropout : float, optional
dropout rate. Defaults to None.
expansion : int
expansion factor of the hidden layer, i.e. the hidden layer has size dim * expansion
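A usage sketch with hypothetical shapes (the mask covers the class token plus the N particles, matching how ParT builds it):

import tensorflow as tf

block = ClassAttentionBlock(dim=64, heads=8, expansion=4, dropout=0.1)
particles = tf.random.normal((2, 10, 64))                # (batch, N, dim)
class_token = tf.random.normal((2, 1, 64))               # (batch, 1, dim)
mask = tf.ones((2, 1, 11), dtype=tf.bool)                # (batch, 1, 1 + N)
updated = block(particles, class_token, mask)            # -> (2, 1, 64), an updated class token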
Expand source code
class ClassAttentionBlock(tf.keras.layers.Layer):
    """Class attention block.
    It allows the class token to attend to the input particles, and then feeds the attended class token
    to the feed-forward network with residual connections and layer normalizations.

    This extracts the class information from the attended particles more effectively.

    Args:
        dim (int): dimension of the input and output
        heads (int): number of heads
        dropout (float, optional): dropout rate. Defaults to None.
        expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
    """

    def __init__(self, dim: int, heads: int, expansion: int, dropout: Optional[float] = None):
        super().__init__()
        self.dim, self.heads, self.dropout, self.expansion = dim, heads, dropout, expansion

        self.pre_mhca_ln = tf.keras.layers.LayerNormalization()
        self.mhca = MultiheadClassAttention(dim=dim, heads=heads, dropout=dropout)
        self.post_mhca_ln = tf.keras.layers.LayerNormalization()
        self.mhca_dropout = tf.keras.layers.Dropout(dropout)

        self.pre_ffn_ln = tf.keras.layers.LayerNormalization()
        self.ffn = FFN(dim=dim, expansion=expansion, activation=tf.nn.gelu, dropout=dropout)

    def get_config(self):
        config = super().get_config()
        config.update({"dim": self.dim, "heads": self.heads, "dropout": self.dropout})
        return config

    def call(self, inputs: tf.Tensor, class_token: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
        """Forward pass of the class attention block

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
            class_token (tf.Tensor): class token tensor of shape `(batch_size, 1, dim)`
                It is concatenated with the input tensor along the particle dimension,
                at the front of the input tensor.
            mask (tf.Tensor): mask tensor of shape `(batch_size, 1, num_particles)`
                This mask is used to mask out the attention of padding particles, generated when
                tf.RaggedTensor is converted to tf.Tensor.

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, 1, dim)`, an updated class token
        """
        attented = tf.concat([class_token, inputs], axis=1)
        attented = self.pre_mhca_ln(attented)
        attented = self.mhca(inputs=attented, class_token=class_token, mask=mask)
        attented = self.post_mhca_ln(attented)
        attented = self.mhca_dropout(attented)
        attented = attented + class_token

        ffned = self.pre_ffn_ln(attented)
        ffned = self.ffn(ffned)
        output = ffned + attented
        return output

Ancestors

  • keras.engine.base_layer.Layer
  • tensorflow.python.module.module.Module
  • tensorflow.python.trackable.autotrackable.AutoTrackable
  • tensorflow.python.trackable.base.Trackable
  • keras.utils.version_utils.LayerVersionSelector

Methods

def call(self, inputs: tensorflow.python.framework.ops.Tensor, class_token: tensorflow.python.framework.ops.Tensor, mask: tensorflow.python.framework.ops.Tensor) ‑> tensorflow.python.framework.ops.Tensor

Forward pass of the class attention block

Args

inputs : tf.Tensor
input tensor of shape (batch_size, num_particles, dim)
class_token : tf.Tensor
class token tensor of shape (batch_size, 1, dim) It is concatenated with the input tensor along the particle dimension, at the front of the input tensor.
mask : tf.Tensor
mask tensor of shape (batch_size, 1, num_particles) This mask is used to mask out the attention of padding particles, generated when tf.RaggedTensor is converted to tf.Tensor.

Returns

tf.Tensor
output tensor of shape (batch_size, 1, dim), an updated class token
Expand source code
def call(self, inputs: tf.Tensor, class_token: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
    """Forward pass of the class attention block

    Args:
        inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
        class_token (tf.Tensor): class token tensor of shape `(batch_size, 1, dim)`
            It is concatenated with the input tensor along the particle dimension,
            at the front of the input tensor.
        mask (tf.Tensor): mask tensor of shape `(batch_size, 1, num_particles)`
            This mask is used to mask out the attention of padding particles, generated when
            tf.RaggedTensor is converted to tf.Tensor.

    Returns:
        tf.Tensor: output tensor of shape `(batch_size, 1, dim)`, an updated class token
    """
    attented = tf.concat([class_token, inputs], axis=1)
    attented = self.pre_mhca_ln(attented)
    attented = self.mhca(inputs=attented, class_token=class_token, mask=mask)
    attented = self.post_mhca_ln(attented)
    attented = self.mhca_dropout(attented)
    attented = attented + class_token

    ffned = self.pre_ffn_ln(attented)
    ffned = self.ffn(ffned)
    output = ffned + attented
    return output
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by Network (one layer of abstraction above).

Note that get_config() does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns

Python dictionary.

Expand source code
def get_config(self):
    config = super().get_config()
    config.update({"dim": self.dim, "heads": self.heads, "dropout": self.dropout})
    return config
class FCEmbedding (embedding_dim: int, num_embeding_layers: int, activation: Callable[[tensorflow.python.framework.ops.Tensor], tensorflow.python.framework.ops.Tensor])

Embedding layer as a series of fully-connected layers.

Args

embedding_dim : int
dimension of the embedding
num_embeding_layers : int
number of fully-connected layers

activation : Callable[[tf.Tensor], tf.Tensor]
activation function
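A usage sketch with hypothetical shapes:

import tensorflow as tf

embed = FCEmbedding(embedding_dim=128, num_embeding_layers=3, activation=tf.nn.gelu)
x = tf.random.normal((2, 10, 8))                         # (batch, num_particles, num_features)
h = embed(x)                                             # -> (2, 10, 128)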

Expand source code
class FCEmbedding(tf.keras.layers.Layer):
    """Embedding layer as a series of fully-connected layers.

    Args:
        embedding_dim (int): dimension of the embedding
        num_embeding_layers (int): number of fully-connected layers
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
    """

    def __init__(self, embedding_dim: int, num_embeding_layers: int, activation: Callable[[tf.Tensor], tf.Tensor], ):

        super().__init__()
        self.embedding_dim, self.activation, self.num_embeding_layers = embedding_dim, activation, num_embeding_layers
        self.layers = [tf.keras.layers.Dense(self.embedding_dim, activation=self.activation)
                       for _ in range(self.num_embeding_layers)]

    def get_config(self):
        config = super(FCEmbedding, self).get_config()
        config.update({name: getattr(self, name) for name in ["embedding_dim", "num_embeding_layers", "activation"]})
        return config

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Forward pass of the embedding layer

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, num_features)`

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, num_particles, embed_dim)`
        """
        hidden = inputs
        for layer in self.layers:
            hidden = layer(hidden)
        return hidden

Ancestors

  • keras.engine.base_layer.Layer
  • tensorflow.python.module.module.Module
  • tensorflow.python.trackable.autotrackable.AutoTrackable
  • tensorflow.python.trackable.base.Trackable
  • keras.utils.version_utils.LayerVersionSelector

Methods

def call(self, inputs: tensorflow.python.framework.ops.Tensor) ‑> tensorflow.python.framework.ops.Tensor

Forward pass of the embedding layer

Args

inputs : tf.Tensor
input tensor of shape (batch_size, num_particles, num_features)

Returns

tf.Tensor
output tensor of shape (batch_size, num_particles, embed_dim)
Expand source code
def call(self, inputs: tf.Tensor) -> tf.Tensor:
    """Forward pass of the embedding layer

    Args:
        inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, num_features)`

    Returns:
        tf.Tensor: output tensor of shape `(batch_size, num_particles, embed_dim)`
    """
    hidden = inputs
    for layer in self.layers:
        hidden = layer(hidden)
    return hidden
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by Network (one layer of abstraction above).

Note that get_config() does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns

Python dictionary.

Expand source code
def get_config(self):
    config = super(FCEmbedding, self).get_config()
    config.update({name: getattr(self, name) for name in ["embedding_dim", "num_embeding_layers", "activation"]})
    return config
class FFN (dim: int, expansion: int, activation: Callable[[tensorflow.python.framework.ops.Tensor], tensorflow.python.framework.ops.Tensor], dropout: Optional[float] = None)

Feed-forward network On top of the Transformer FFN layer, it adds a layer normalization in between the two dense layers.

Args

dim : int
dimension of the input and output
expansion : int
expansion factor of the hidden layer, i.e. the hidden layer has size dim * expansion
activation : Callable[[tf.Tensor], tf.Tensor]
activation function
dropout : float, optional
dropout rate. Defaults to None.
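A usage sketch with hypothetical shapes (here the hidden layer has size dim * expansion = 256):

import tensorflow as tf

ffn = FFN(dim=64, expansion=4, activation=tf.nn.gelu, dropout=0.1)
x = tf.random.normal((2, 10, 64))                        # (batch, num_particles, dim)
y = ffn(x)                                               # -> (2, 10, 64)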
Expand source code
class FFN(tf.keras.layers.Layer):
    """Feed-forward network
    On top of the Transformer FFN layer, it adds a layer normalization in between the two dense layers.

    Args:
        dim (int): dimension of the input and output
        expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
        dropout (float, optional): dropout rate. Defaults to None.
    """

    def __init__(self, dim: int, expansion: int, activation: Callable[[tf.Tensor], tf.Tensor], dropout: Optional[float] = None):
        super().__init__()
        self.dim, self.expansion, self.activation, self.dropout = dim, expansion, activation, dropout

        self.wide_dense = tf.keras.layers.Dense(dim * expansion, activation=activation)
        self.dense = tf.keras.layers.Dense(dim, activation=None)
        self.ln = tf.keras.layers.LayerNormalization()
        self.layer_dropout = tf.keras.layers.Dropout(dropout)

    def get_config(self):
        config = super(FFN, self).get_config()
        config.update({"dim": self.dim, "expansion": self.expansion,
                      "activation": self.activation, "dropout": self.dropout})
        return config

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """Forward pass of the feed-forward network
        Includes a layer normalization layer in between the two dense layers

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
        """
        output = self.wide_dense(inputs)
        output = self.ln(output)
        output = self.dense(output)
        output = self.layer_dropout(output)
        return output

Ancestors

  • keras.engine.base_layer.Layer
  • tensorflow.python.module.module.Module
  • tensorflow.python.trackable.autotrackable.AutoTrackable
  • tensorflow.python.trackable.base.Trackable
  • keras.utils.version_utils.LayerVersionSelector

Methods

def call(self, inputs: tensorflow.python.framework.ops.Tensor) ‑> tensorflow.python.framework.ops.Tensor

Forward pass of the feed-forward network Includes a layer normalization layer in between the two dense layers

Args

inputs : tf.Tensor
input tensor of shape (batch_size, num_particles, dim)

Returns

tf.Tensor
output tensor of shape (batch_size, num_particles, dim)
Expand source code
def call(self, inputs: tf.Tensor) -> tf.Tensor:
    """Forward pass of the feed-forward network
    Includes a layer normalization layer in between the two dense layers

    Args:
        inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`

    Returns:
        tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
    """
    output = self.wide_dense(inputs)
    output = self.ln(output)
    output = self.dense(output)
    output = self.layer_dropout(output)
    return output
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by Network (one layer of abstraction above).

Note that get_config() does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns

Python dictionary.

Expand source code
def get_config(self):
    config = super(FFN, self).get_config()
    config.update({"dim": self.dim, "expansion": self.expansion,
                  "activation": self.activation, "dropout": self.dropout})
    return config
class MultiheadClassAttention (dim: int, heads: int, dropout: Optional[float] = None)

Multi-head class attention layer This layer is a wrapper around the tf.keras.layers.MultiHeadAttention layer, fixing the key and value to be the same as the input and using only the class token as the query.

Args

dim : int
dimension of the input and output
heads : int
number of heads
dropout : float, optional
dropout rate, defaults to None
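A usage sketch with hypothetical shapes; the class token is the only query, so the output has a single token:

import tensorflow as tf

mhca = MultiheadClassAttention(dim=64, heads=8, dropout=0.1)
particles = tf.random.normal((2, 10, 64))                # (batch, N, dim) keys and values
class_token = tf.random.normal((2, 1, 64))               # (batch, 1, dim) query
mask = tf.ones((2, 1, 10), dtype=tf.bool)                # (batch, 1, N)
out = mhca(particles, class_token, mask)                 # -> (2, 1, 64)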
Expand source code
class MultiheadClassAttention(tf.keras.layers.Layer):
    """Multi-head class attention layer
    This layer is a wrapper around the `tf.keras.layers.MultiHeadAttention` layer,
    fixing the key and value to be the same as the input and using only the class token
    as the query.

    Args:
        dim (int): dimension of the input and output
        heads (int): number of heads
        dropout (float, optional): dropout rate, defaults to None
    """

    def __init__(self, dim: int, heads: int, dropout: Optional[float] = None):
        super().__init__()
        self.dim, self.heads, self.dropout = dim, heads, dropout
        self.mha = tf.keras.layers.MultiHeadAttention(key_dim=dim // heads, num_heads=heads)
        self.layer_dropout = tf.keras.layers.Dropout(dropout)

    def get_config(self):
        config = super(MultiheadClassAttention, self).get_config()
        config.update({"dim": self.dim, "heads": self.heads, "dropout": self.dropout})
        return config

    def call(self, inputs: tf.Tensor, class_token: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
        """Forward pass of the multi-head self-attention layer

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
            class_token (tf.Tensor): class token tensor of shape `(batch_size, 1, dim)`
            mask (tf.Tensor): mask tensor of shape `(batch_size, 1, num_particles)`
                This mask is used to mask out the attention of padding particles, generated when
                tf.RaggedTensor is converted to tf.Tensor.
        Returns:
            tf.Tensor: output tensor of shape `(batch_size, 1, dim)`
        """
        output = self.mha(query=class_token, value=inputs, key=inputs, attention_mask=mask)
        output = self.layer_dropout(output)
        return output

Ancestors

  • keras.engine.base_layer.Layer
  • tensorflow.python.module.module.Module
  • tensorflow.python.trackable.autotrackable.AutoTrackable
  • tensorflow.python.trackable.base.Trackable
  • keras.utils.version_utils.LayerVersionSelector

Methods

def call(self, inputs: tensorflow.python.framework.ops.Tensor, class_token: tensorflow.python.framework.ops.Tensor, mask: tensorflow.python.framework.ops.Tensor) ‑> tensorflow.python.framework.ops.Tensor

Forward pass of the multi-head class attention layer

Args

inputs : tf.Tensor
input tensor of shape (batch_size, num_particles, dim)
class_token : tf.Tensor
class token tensor of shape (batch_size, 1, dim)
mask : tf.Tensor
mask tensor of shape (batch_size, 1, num_particles) This mask is used to mask out the attention of padding particles, generated when tf.RaggedTensor is converted to tf.Tensor.

Returns

tf.Tensor
output tensor of shape (batch_size, 1, dim)
Expand source code
def call(self, inputs: tf.Tensor, class_token: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
    """Forward pass of the multi-head self-attention layer

    Args:
        inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
        class_token (tf.Tensor): class token tensor of shape `(batch_size, 1, dim)`
        mask (tf.Tensor): mask tensor of shape `(batch_size, 1, num_particles)`
            This mask is used to mask out the attention of padding particles, generated when
            tf.RaggedTensor is converted to tf.Tensor.
    Returns:
        tf.Tensor: output tensor of shape `(batch_size, 1, dim)`
    """
    output = self.mha(query=class_token, value=inputs, key=inputs, attention_mask=mask)
    output = self.layer_dropout(output)
    return output
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by Network (one layer of abstraction above).

Note that get_config() does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns

Python dictionary.

Expand source code
def get_config(self):
    config = super(MultiheadClassAttention, self).get_config()
    config.update({"dim": self.dim, "heads": self.heads, "dropout": self.dropout})
    return config
class MultiheadSelfAttention (dim: int, heads: int)

Multi-head self-attention layer Standalone implementation of the multi-head self-attention layer, which includes the interaction variables.

Args

dim : int
dimension of the input and output
heads : int
number of heads
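A usage sketch with hypothetical shapes; the boolean mask is broadcast against the per-head attention weights (as ParT builds it), and the optional interaction tensor biases the attention logits:

import tensorflow as tf

mhsa = MultiheadSelfAttention(dim=64, heads=8)
x = tf.random.normal((2, 10, 64))                        # (batch, N, dim)
mask = tf.ones((2, 1, 10, 10), dtype=tf.bool)            # broadcast over the 8 heads
u = tf.random.normal((2, 10, 10, 8))                     # (batch, N, N, heads)
y = mhsa(x, mask=mask, interaction=u)                    # -> (2, 10, 64)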
Expand source code
class MultiheadSelfAttention(tf.keras.layers.Layer):
    """Multi-head self-attention layer
    Standalone implementation of the multi-head self-attention layer, which
    includes the interaction variables.

    Args:
        dim (int): dimension of the input and output
        heads (int): number of heads

    """

    def __init__(self, dim: int, heads: int, ):
        super().__init__()
        self.dim, self.heads = dim, heads

        self.linear_qkv = tf.keras.layers.Dense(dim * 3)
        self.linear_out = tf.keras.layers.Dense(dim)

    def get_config(self):
        config = super(MultiheadSelfAttention, self).get_config()
        config.update({"dim": self.dim, "heads": self.heads})
        return config

    def call(self, inputs: tf.Tensor, mask: tf.Tensor, interaction: Optional[tf.Tensor] = None) -> tf.Tensor:
        """Forward pass of the multi-head self-attention layer

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
            mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles, num_particles)`
                This mask is used to mask out the attention of padding particles, generated when
                tf.RaggedTensor is converted to tf.Tensor.
            interaction (tf.Tensor, optional): interaction tensor of shape `(batch_size, num_particles, num_particles, heads)`

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
        """
        B, N, C = tf.shape(inputs)[0], tf.shape(inputs)[1], tf.shape(inputs)[2]

        qkv = self.linear_qkv(inputs)  # (B, N, C * 3)
        qkv = tf.reshape(qkv, [B, N, 3, self.heads, C // self.heads])  # (B, N, 3, H, C // H)
        qkv = tf.transpose(qkv, [2, 0, 3, 1, 4])  # (3, B, H, N, C // H)
        q, k, v = qkv[0], qkv[1], qkv[2]  # 3 x (B, H, N, C // H)

        attention_weights = tf.linalg.matmul(q, k, transpose_b=True) / (q.shape[-1] ** 0.5)  # (B, H, N, N)

        if interaction is not None:
            interaction = tf.transpose(interaction, [0, 3, 1, 2])  # (B, H, N, N)
            attention_weights += interaction

        attention = tf.keras.layers.Softmax()(attention_weights, mask=mask)  # (B, H, N, N)

        output = tf.linalg.matmul(attention, v)  # (B, H, N, C // H)
        output = tf.transpose(output, [0, 2, 1, 3])  # (B, N, H, C // H)
        output = tf.reshape(output, [B, N, C])  # (B, N, C)
        output = self.linear_out(output)  # (B, N, C)
        return output

Ancestors

  • keras.engine.base_layer.Layer
  • tensorflow.python.module.module.Module
  • tensorflow.python.trackable.autotrackable.AutoTrackable
  • tensorflow.python.trackable.base.Trackable
  • keras.utils.version_utils.LayerVersionSelector

Methods

def call(self, inputs: tensorflow.python.framework.ops.Tensor, mask: tensorflow.python.framework.ops.Tensor, interaction: Optional[tensorflow.python.framework.ops.Tensor] = None) ‑> tensorflow.python.framework.ops.Tensor

Forward pass of the multi-head self-attention layer

Args

inputs : tf.Tensor
input tensor of shape (batch_size, num_particles, dim)
mask : tf.Tensor
mask tensor of shape (batch_size, num_particles, num_particles). This mask is used to mask out the attention of padding particles, generated when tf.RaggedTensor is converted to tf.Tensor.
interaction : tf.Tensor, optional
interaction tensor of shape (batch_size, num_particles, num_particles, heads)

Returns

tf.Tensor
output tensor of shape (batch_size, num_particles, dim)
Expand source code
def call(self, inputs: tf.Tensor, mask: tf.Tensor, interaction: Optional[tf.Tensor] = None) -> tf.Tensor:
    """Forward pass of the multi-head self-attention layer

    Args:
        inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
        mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles, num_particles)`
            This mask is used to mask out the attention of padding particles, generated when
            tf.RaggedTensor is converted to tf.Tensor.
        interaction (tf.Tensor, optional): interaction tensor of shape `(batch_size, num_particles, num_particles, heads)`

    Returns:
        tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
    """
    B, N, C = tf.shape(inputs)[0], tf.shape(inputs)[1], tf.shape(inputs)[2]

    qkv = self.linear_qkv(inputs)  # (B, N, C * 3)
    qkv = tf.reshape(qkv, [B, N, 3, self.heads, C // self.heads])  # (B, N, 3, H, C // H)
    qkv = tf.transpose(qkv, [2, 0, 3, 1, 4])  # (3, B, H, N, C // H)
    q, k, v = qkv[0], qkv[1], qkv[2]  # 3 x (B, H, N, C // H)

    attention_weights = tf.linalg.matmul(q, k, transpose_b=True) / (q.shape[-1] ** 0.5)  # (B, H, N, N)

    if interaction is not None:
        interaction = tf.transpose(interaction, [0, 3, 1, 2])  # (B, H, N, N)
        attention_weights += interaction

    attention = tf.keras.layers.Softmax()(attention_weights, mask=mask)  # (B, H, N, N)

    output = tf.linalg.matmul(attention, v)  # (B, H, N, C // H)
    output = tf.transpose(output, [0, 2, 1, 3])  # (B, N, H, C // H)
    output = tf.reshape(output, [B, N, C])  # (B, N, C)
    output = self.linear_out(output)  # (B, N, C)
    return output
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by Network (one layer of abstraction above).

Note that get_config() does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns

Python dictionary.

Expand source code
def get_config(self):
    config = super(MultiheadSelfAttention, self).get_config()
    config.update({"dim": self.dim, "heads": self.heads})
    return config
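
As a quick shape check, the sketch below runs MultiheadSelfAttention on random data; all sizes are arbitrary. The mask is built in the broadcastable (batch, 1, N, N) form that ParT constructs internally, and the interaction tensor supplies one (N, N) bias matrix per head.

import tensorflow as tf
from jidenn.models.ParT import MultiheadSelfAttention

B, N, dim, heads = 2, 6, 64, 8
mhsa = MultiheadSelfAttention(dim=dim, heads=heads)

inputs = tf.random.normal((B, N, dim))
# Boolean mask broadcastable against the (B, heads, N, N) attention logits;
# here every particle is valid, so nothing is masked out.
mask = tf.ones((B, 1, N, N), dtype=tf.bool)
# Optional pairwise bias U added to the attention logits, one channel per head.
interaction = tf.random.normal((B, N, N, heads))

output = mhsa(inputs, mask=mask, interaction=interaction)
print(output.shape)  # (2, 6, 64)
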
class ParT (dim: int, self_attn_layers: int, class_attn_layers: int, expansion: int, heads: int, activation: Callable[[tensorflow.python.framework.ops.Tensor], tensorflow.python.framework.ops.Tensor], dropout: Optional[float] = None)

Pure Particle Transformer (ParT) layers without the embedding and output layers.

It also creates the class token, which is used to encode the global information of the input, using the ClassAttentionBlock.

Args

dim : int
dimension of the input and output
self_attn_layers : int
number of self-attention layers
class_attn_layers : int
number of class-attention layers
expansion : int
expansion factor of the hidden layer, i.e. the hidden layer has size dim * expansion
heads : int
number of heads
activation : Callable[[tf.Tensor], tf.Tensor]
activation function
dropout : float, optional
dropout rate. Defaults to None.
Expand source code
class ParT(tf.keras.layers.Layer):
    """Pure Particle Transformer (ParT) layers without the embedding and output layers.

    It also creates the class token, which is used to encode the global information of the input,
    using the ClassAttentionBlock.

    Args:
        dim (int): dimension of the input and output
        self_attn_layers (int): number of self-attention layers
        class_attn_layers (int): number of class-attention layers
        expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
        heads (int): number of heads
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
        dropout (float, optional): dropout rate. Defaults to None.
    """

    def __init__(self,
                 dim: int,
                 self_attn_layers: int,
                 class_attn_layers: int,
                 expansion: int,
                 heads: int,
                 activation: Callable[[tf.Tensor], tf.Tensor],
                 dropout: Optional[float] = None):
        # Make sure `dim` is even.
        assert dim % 2 == 0

        super().__init__()
        self.dim, self.expansion, self.heads, self.dropout, self.activation, self.num_selfattn_layers, self.num_class_layers = dim, expansion, heads, dropout, activation, self_attn_layers, class_attn_layers

        self.class_token = tf.Variable(tf.random.truncated_normal((1, 1, dim), stddev=0.02), trainable=True)
        self.sa_layers = [SelfAttentionBlock(dim, heads, expansion, activation, dropout)
                          for _ in range(self_attn_layers)]
        self.ca_layers = [ClassAttentionBlock(dim, heads, expansion, dropout) for _ in range(class_attn_layers)]

    def get_config(self):
        config = super(ParT, self).get_config()
        config.update({name: getattr(self, name)
                      for name in ["dim", "expansion", "heads", "dropout", "activation", "num_selfattn_layers", "num_class_layers"]})
        return config

    def call(self, inputs: tf.Tensor, mask: tf.Tensor, interaction: Optional[tf.Tensor] = None) -> tf.Tensor:
        """Forward pass of the ParT layers

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
            mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles)`.
                From the mask, a mask tensor of shape `(batch_size, num_particles, num_particles)`
                is calculated, which is used to mask out the attention of padding particles, generated when
                `tf.RaggedTensor` is converted to `tf.Tensor`.
            interaction (tf.Tensor, optional): interaction tensor of shape `(batch_size, num_particles, num_particles, heads)`

        Returns:
            tf.Tensor: class token tensor of shape `(batch_size, 1, dim)`
        """
        # Pairwise self-attention mask: a pair is attended only if both particles are real (non-padding).
        sa_mask = mask[:, tf.newaxis, tf.newaxis, :] & mask[:, tf.newaxis, :, tf.newaxis]  # (B, 1, N, N)
        hidden = inputs
        for layer in self.sa_layers:
            hidden = layer(hidden, sa_mask, interaction)

        # Broadcast the learnable class token over the batch; it attends to itself and all real particles.
        class_token = tf.tile(self.class_token, [tf.shape(inputs)[0], 1, 1])  # (B, 1, dim)
        class_mask = mask[:, tf.newaxis, :]  # (B, 1, N)
        class_mask = tf.concat([tf.ones((tf.shape(inputs)[0], 1, 1), dtype=tf.bool), class_mask], axis=2)  # (B, 1, N + 1)
        for layer in self.ca_layers:
            class_token = layer(hidden, class_token, class_mask)
        return class_token

Ancestors

  • keras.engine.base_layer.Layer
  • tensorflow.python.module.module.Module
  • tensorflow.python.trackable.autotrackable.AutoTrackable
  • tensorflow.python.trackable.base.Trackable
  • keras.utils.version_utils.LayerVersionSelector

Methods

def call(self, inputs: tensorflow.python.framework.ops.Tensor, mask: tensorflow.python.framework.ops.Tensor, interaction: Optional[tensorflow.python.framework.ops.Tensor] = None) ‑> tensorflow.python.framework.ops.Tensor

Forward pass of the ParT layers

Args

inputs : tf.Tensor
input tensor of shape (batch_size, num_particles, dim)
mask : tf.Tensor
mask tensor of shape (batch_size, num_particles). From the mask, a mask tensor of shape (batch_size, num_particles, num_particles) is calculated, which is used to mask out the attention of padding particles, generated when tf.RaggedTensor is converted to tf.Tensor.
interaction : tf.Tensor, optional
interaction tensor of shape (batch_size, num_particles, num_particles, heads)

Returns

tf.Tensor
class token tensor of shape (batch_size, 1, dim)
Expand source code
def call(self, inputs: tf.Tensor, mask: tf.Tensor, interaction: Optional[tf.Tensor] = None) -> tf.Tensor:
    """Forward pass of the ParT layers

    Args:
        inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
        mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles)`.
            From the mask, a mask tensor of shape `(batch_size, num_particles, num_particles)`
            is calculated, which is used to mask out the attention of padding particles, generated when
            `tf.RaggedTensor` is converted to `tf.Tensor`.
        interaction (tf.Tensor, optional): interaction tensor of shape `(batch_size, num_particles, num_particles, heads)`

    Returns:
        tf.Tensor: class token tensor of shape `(batch_size, 1, dim)`
    """
    # Pairwise self-attention mask: a pair is attended only if both particles are real (non-padding).
    sa_mask = mask[:, tf.newaxis, tf.newaxis, :] & mask[:, tf.newaxis, :, tf.newaxis]  # (B, 1, N, N)
    hidden = inputs
    for layer in self.sa_layers:
        hidden = layer(hidden, sa_mask, interaction)

    # Broadcast the learnable class token over the batch; it attends to itself and all real particles.
    class_token = tf.tile(self.class_token, [tf.shape(inputs)[0], 1, 1])  # (B, 1, dim)
    class_mask = mask[:, tf.newaxis, :]  # (B, 1, N)
    class_mask = tf.concat([tf.ones((tf.shape(inputs)[0], 1, 1), dtype=tf.bool), class_mask], axis=2)  # (B, 1, N + 1)
    for layer in self.ca_layers:
        class_token = layer(hidden, class_token, class_mask)
    return class_token
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by Network (one layer of abstraction above).

Note that get_config() does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns

Python dictionary.

Expand source code
def get_config(self):
    config = super(ParT, self).get_config()
    config.update({name: getattr(self, name)
                  for name in ["dim", "expansion", "heads", "dropout", "activation", "num_selfattn_layers", "num_class_layers"]})
    return config
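
The sketch below feeds already-embedded particles through the ParT stack together with a per-particle validity mask, as produced by tf.sequence_mask on the row lengths of a ragged batch. Hyperparameters and feature values are illustrative placeholders.

import tensorflow as tf
from jidenn.models.ParT import ParT

B, N, dim, heads = 2, 10, 64, 8
part = ParT(dim=dim, self_attn_layers=3, class_attn_layers=2,
            expansion=4, heads=heads, activation=tf.nn.gelu, dropout=0.1)

embedded = tf.random.normal((B, N, dim))          # already-embedded particles
mask = tf.sequence_mask([10, 7], maxlen=N)        # True for real particles, False for padding
interaction = tf.random.normal((B, N, N, heads))  # optional pairwise interaction embedding

class_token = part(embedded, mask=mask, interaction=interaction)
print(class_token.shape)  # (2, 1, 64): the class token summarising each set of particles
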
class ParTModel (input_shape: Union[Tuple[None, int], Tuple[Tuple[None, int], Tuple[None, None, int]]], embed_dim: int, embed_layers: int, self_attn_layers: int, class_attn_layers: int, expansion: int, heads: int, output_layer: keras.engine.base_layer.Layer, activation: Callable[[tensorflow.python.framework.ops.Tensor], tensorflow.python.framework.ops.Tensor], dropout: Optional[float] = None, interaction_embed_layers: Optional[int] = None, interaction_embed_layer_size: Optional[int] = None, preprocess: Union[keras.engine.base_layer.Layer, ForwardRef(None), Tuple[keras.engine.base_layer.Layer, keras.engine.base_layer.Layer]] = None)

ParT model with embedding and output layers.

The model already contains the tf.keras.layers.Input layer, so it can be used as a standalone model.

The input tensor can be either a tensor of shape (batch_size, num_particles, num_features) or a tuple of tensors (particle_tensor, interaction_tensor) of shapes (batch_size, num_particles, num_features) and (batch_size, num_particles, num_particles, num_features), respectively.

The model can be used with or without the interaction tensor, depending on the type of the input shape: if it is a tuple, the interaction tensor is assumed to be present.

The input tensor is first passed through the embedding layer, then the ParT layers, and finally the output layer. If the interaction tensor is present, it is passed through the interaction embedding layer before the ParT layers.

If the preprocessing layer is not None, the input tensor is first passed through the preprocessing layer before the embedding layer. If the interaction tensor is present, the preprocessing layer is a tuple of two layers, each of which is applied to the particle and interaction tensors, respectively.

The output of ParT is a vector of shape (batch_size, embed_dim) with extracted class information. This is then passed through the output layer. Layer normalization is applied to the output of the ParT layers before the output layer.

Args

input_shape : Union[Tuple[None, int], Tuple[Tuple[None, int], Tuple[None, None, int]]]
shape of the input tensor. If the interaction tensor is present, it is assumed to be a tuple of two shapes, each creating a separate input layer.
embed_dim : int
dimension of the embedding layer
embed_layers : int
number of layers of the embedding layer
self_attn_layers : int
number of self-attention layers
class_attn_layers : int
number of class-attention layers
expansion : int
expansion factor of the self-attention layers
heads : int
number of heads of the self-attention layers
output_layer : tf.keras.layers.Layer
output layer
activation : Callable[[tf.Tensor], tf.Tensor]
activation function
dropout : Optional[float], optional
dropout rate. Defaults to None.
interaction_embed_layers : Optional[int], optional
number of layers of the interaction embedding layer. Defaults to None.
interaction_embed_layer_size : Optional[int], optional
size of the layers of the interaction embedding layer. Defaults to None.
preprocess : Union[tf.keras.layers.Layer, None, Tuple[tf.keras.layers.Layer, tf.keras.layers.Layer]], optional
preprocessing layer. Defaults to None.
Expand source code
class ParTModel(tf.keras.Model):
    """ParT model with embwith embedding and output layers.

    The model already contains the `tf.keras.layers.Input` layer, so it can be used as a standalone model.

    The input tensor can be either a tensor of shape `(batch_size, num_particles, num_features)` or
    a tuple of tensors `(particle_tensor, interaction_tensor)` of shapes 
    `(batch_size, num_particles, num_features)` and `(batch_size, num_particles, num_particles, num_features)`, respectively.

    The model can be used with or without the interaction tensor, depending on the type of the input shape:
    if it is a tuple, the interaction tensor is assumed to be present.

    The input tensor is first passed through the embedding layer, then the ParT layers, and finally the output layer.
    If the interaction tensor is present, it is passed through the interaction embedding layer before the ParT layers.

    If the preprocessing layer is not None, the input tensor is first passed through the preprocessing layer before the embedding layer.
    If the interaction tensor is present, the preprocessing layer is a tuple of two layers,
    each of which is applied to the particle and interaction tensors, respectively.

    The output of ParT is a vector of shape `(batch_size, embed_dim)` with extracted class information.
    This is then passed through the output layer.
    Layer normalization is applied to the output of the ParT layers before the output layer.

    Args:
        input_shape (Union[Tuple[None, int], Tuple[Tuple[None, int], Tuple[None, None, int]]]): shape of the input tensor.
            If the interaction tensor is present, it is assumed to be a tuple of two shapes,
            each creating a separate input layer.
        embed_dim (int): dimension of the embedding layer
        embed_layers (int): number of layers of the embedding layer
        self_attn_layers (int): number of self-attention layers
        class_attn_layers (int): number of class-attention layers
        expansion (int): expansion factor of the self-attention layers
        heads (int): number of heads of the self-attention layers
        output_layer (tf.keras.layers.Layer): output layer
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
        dropout (Optional[float], optional): dropout rate. Defaults to None.
        interaction_embed_layers (Optional[int], optional): number of layers of the interaction embedding layer. Defaults to None.
        interaction_embed_layer_size (Optional[int], optional): size of the layers of the interaction embedding layer. Defaults to None.
        preprocess (Union[tf.keras.layers.Layer, None, Tuple[tf.keras.layers.Layer, tf.keras.layers.Layer]], optional): preprocessing layer. Defaults to None.

    """

    def __init__(self,
                 input_shape: Union[Tuple[None, int], Tuple[Tuple[None, int], Tuple[None, None, int]]],
                 embed_dim: int,
                 embed_layers: int,
                 self_attn_layers: int,
                 class_attn_layers: int,
                 expansion: int,
                 heads: int,
                 output_layer: tf.keras.layers.Layer,
                 activation: Callable[[tf.Tensor], tf.Tensor],
                 dropout: Optional[float] = None,
                 interaction_embed_layers: Optional[int] = None,
                 interaction_embed_layer_size: Optional[int] = None,
                 preprocess: Union[tf.keras.layers.Layer, None, Tuple[tf.keras.layers.Layer, tf.keras.layers.Layer]] = None):

        if isinstance(input_shape, tuple) and isinstance(input_shape[0], tuple):
            input = (tf.keras.layers.Input(shape=input_shape[0], ragged=True),
                     tf.keras.layers.Input(shape=input_shape[1], ragged=True))
            row_lengths = input[0].row_lengths()
            hidden = input[0].to_tensor()
            interaction_hidden = input[1].to_tensor()

            if preprocess is not None:
                if not isinstance(preprocess, tuple):
                    raise ValueError(
                        "preprocess must be a tuple of two layers when the input is a tuple of two tensors.")

                preprocess, interaction_preprocess = preprocess
                if interaction_preprocess is not None:
                    interaction_hidden = interaction_preprocess(interaction_hidden)

            if interaction_embed_layers is None or interaction_embed_layer_size is None:
                raise ValueError(
                    """interaction_embed_layers and interaction_embed_layer_size must be specified 
                    when the input is a tuple of two tensors, i.e. the interaction variables are used.""")

            embed_interaction = CNNEmbedding(
                interaction_embed_layers,
                interaction_embed_layer_size,
                heads,
                activation)(interaction_hidden)
        else:
            input = tf.keras.layers.Input(shape=input_shape, ragged=True)
            embed_interaction = None
            row_lengths = input.row_lengths()
            hidden = input.to_tensor()

        if preprocess is not None:
            if isinstance(preprocess, tuple):
                raise ValueError("preprocess must be a single layer when the input is a single tensor.")
            hidden = preprocess(hidden)

        hidden = FCEmbedding(embed_dim, embed_layers, activation)(hidden)

        transformed = ParT(dim=embed_dim,
                           self_attn_layers=self_attn_layers,
                           class_attn_layers=class_attn_layers,
                           expansion=expansion,
                           heads=heads,
                           dropout=dropout,
                           activation=activation)(hidden, tf.sequence_mask(row_lengths), embed_interaction)

        transformed = tf.keras.layers.LayerNormalization()(transformed)
        output = output_layer(transformed[:, 0, :])

        super().__init__(inputs=input, outputs=output)

Ancestors

  • keras.engine.training.Model
  • keras.engine.base_layer.Layer
  • tensorflow.python.module.module.Module
  • tensorflow.python.trackable.autotrackable.AutoTrackable
  • tensorflow.python.trackable.base.Trackable
  • keras.utils.version_utils.LayerVersionSelector
  • keras.utils.version_utils.ModelVersionSelector
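
A construction sketch for ParTModel with interaction variables. The per-particle and pairwise feature counts (8 and 4), the hyperparameters, and the single-unit sigmoid output head are illustrative placeholders, not values prescribed by the module.

import tensorflow as tf
from jidenn.models.ParT import ParTModel

model = ParTModel(
    input_shape=((None, 8), (None, None, 4)),   # (particle features, pairwise interaction features)
    embed_dim=128,
    embed_layers=3,
    self_attn_layers=8,
    class_attn_layers=2,
    expansion=4,
    heads=8,
    output_layer=tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
    activation=tf.nn.gelu,
    dropout=0.1,
    interaction_embed_layers=3,
    interaction_embed_layer_size=64,
)
model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.BinaryCrossentropy())

The model then expects ragged inputs, i.e. a tuple of tf.RaggedTensor batches with shapes (batch, None, 8) and (batch, None, None, 4) when the interaction variables are used.
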
class SelfAttentionBlock (dim: int, heads: int, expansion: int, activation: Callable[[tensorflow.python.framework.ops.Tensor], tensorflow.python.framework.ops.Tensor], dropout: Optional[float] = None)

Self-attention block. It contains a multi-head self-attention layer and a feed-forward network with residual connections and layer normalizations. The self-attention layer includes the interaction variables.

Args

dim : int
dimension of the input and output
heads : int
number of heads
expansion : int
expansion factor of the hidden layer, i.e. the hidden layer has size dim * expansion
activation : Callable[[tf.Tensor], tf.Tensor]
activation function
dropout : float, optional
dropout rate. Defaults to None.
Expand source code
class SelfAttentionBlock(tf.keras.layers.Layer):
    """Self-attention block.
    It contains a multi-head self-attention layer and a feed-forward network with residual connections
    and layer normalizations. The self-attention layer includes the interaction variables.

    Args:
        dim (int): dimension of the input and output
        heads (int): number of heads
        expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
        dropout (float, optional): dropout rate. Defaults to None.
    """

    def __init__(self, dim: int, heads: int, expansion: int, activation: Callable[[tf.Tensor], tf.Tensor], dropout: Optional[float] = None):
        super().__init__()
        self.dim, self.heads, self.dropout, self.expansion, self.activation = dim, heads, dropout, expansion, activation

        self.pre_mhsa_ln = tf.keras.layers.LayerNormalization()
        self.mhsa = MultiheadSelfAttention(dim=dim, heads=heads)
        self.post_mhsa_ln = tf.keras.layers.LayerNormalization()
        self.mhsa_dropout = tf.keras.layers.Dropout(dropout)

        self.pre_ffn_ln = tf.keras.layers.LayerNormalization()
        self.ffn = FFN(dim=dim, expansion=expansion, activation=activation, dropout=dropout)

    def get_config(self):
        config = super().get_config()
        config.update({"dim": self.dim, "heads": self.heads, "dropout": self.dropout,
                      "expansion": self.expansion, "activation": self.activation})
        return config

    def call(self, inputs: tf.Tensor, mask: tf.Tensor, interaction: Optional[tf.Tensor] = None) -> tf.Tensor:
        """Forward pass of the self-attention block

        Args:
            inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
            mask (tf.Tensor, optional): mask tensor of shape `(batch_size, num_particles, num_particles)`. Defaults to None.
                This mask is used to mask out the attention of padding particles, generated when
                tf.RaggedTensor is converted to tf.Tensor.
            interaction (tf.Tensor, optional): interaction tensor of shape `(batch_size, num_particles, num_particles, heads)`. Defaults to None.

        Returns:
            tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
        """
        attented = self.pre_mhsa_ln(inputs)
        attented = self.mhsa(inputs=attented, mask=mask, interaction=interaction)
        attented = self.post_mhsa_ln(attented)
        attented = self.mhsa_dropout(attented)
        attented = attented + inputs

        ffned = self.pre_ffn_ln(attented)
        ffned = self.ffn(ffned)
        output = ffned + attented

        return output

Ancestors

  • keras.engine.base_layer.Layer
  • tensorflow.python.module.module.Module
  • tensorflow.python.trackable.autotrackable.AutoTrackable
  • tensorflow.python.trackable.base.Trackable
  • keras.utils.version_utils.LayerVersionSelector

Methods

def call(self, inputs: tensorflow.python.framework.ops.Tensor, mask: tensorflow.python.framework.ops.Tensor, interaction: Optional[tensorflow.python.framework.ops.Tensor] = None) ‑> tensorflow.python.framework.ops.Tensor

Forward pass of the self-attention block

Args

inputs : tf.Tensor
input tensor of shape (batch_size, num_particles, dim)
mask : tf.Tensor, optional
mask tensor of shape (batch_size, num_particles, num_particles). Defaults to None. This mask is used to mask out the attention of padding particles, generated when tf.RaggedTensor is converted to tf.Tensor.
interaction : tf.Tensor, optional
interaction tensor of shape (batch_size, num_particles, num_particles, heads). Defaults to None.

Returns

tf.Tensor
output tensor of shape (batch_size, num_particles, dim)
Expand source code
def call(self, inputs: tf.Tensor, mask: tf.Tensor, interaction: Optional[tf.Tensor] = None) -> tf.Tensor:
    """Forward pass of the self-attention block

    Args:
        inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
        mask (tf.Tensor, optional): mask tensor of shape `(batch_size, num_particles, num_particles)`. Defaults to None.
            This mask is used to mask out the attention of padding particles, generated when
            tf.RaggedTensor is converted to tf.Tensor.
        interaction (tf.Tensor, optional): interaction tensor of shape `(batch_size, num_particles, num_particles, heads)`. Defaults to None.

    Returns:
        tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
    """
    attented = self.pre_mhsa_ln(inputs)
    attented = self.mhsa(inputs=attented, mask=mask, interaction=interaction)
    attented = self.post_mhsa_ln(attented)
    attented = self.mhsa_dropout(attented)
    attented = attented + inputs

    ffned = self.pre_ffn_ln(attented)
    ffned = self.ffn(ffned)
    output = ffned + attented

    return output
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by Network (one layer of abstraction above).

Note that get_config() does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns

Python dictionary.

Expand source code
def get_config(self):
    config = super().get_config()
    config.update({"dim": self.dim, "heads": self.heads, "dropout": self.dropout,
                  "expansion": self.expansion, "activation": self.activation})
    return config
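
Finally, a short shape-preservation check for SelfAttentionBlock: because of the residual connections, the output has the same shape as the input. All values below are arbitrary.

import tensorflow as tf
from jidenn.models.ParT import SelfAttentionBlock

B, N, dim, heads = 2, 5, 32, 4
block = SelfAttentionBlock(dim=dim, heads=heads, expansion=4,
                           activation=tf.nn.gelu, dropout=0.1)

x = tf.random.normal((B, N, dim))
sa_mask = tf.ones((B, 1, N, N), dtype=tf.bool)    # broadcastable pairwise mask
interaction = tf.random.normal((B, N, N, heads))  # optional pairwise bias

y = block(x, mask=sa_mask, interaction=interaction)
print(y.shape)  # (2, 5, 32)
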