Module jidenn.models.Transformer
Implementation of the Transformer model from the paper "Attention is all you need," see https://arxiv.org/abs/1706.03762.
The model is a stack of self-attention blocks, each of which contains a multi-head self-attention layer and a feed-forward network.
The input features are embedded into a vector of size `dim`, which is then passed through the self-attention blocks.
"""
Implementation of the Transformer model from the paper "Attention is all you need," see https://arxiv.org/abs/1706.03762.
The model is a stack of self-attention blocks, each of which contains a multi-head self-attention layer and a feed-forward network.
The input features are embedded into a vector of size `dim`, which is then passed through the self-attention blocks.
![Transformer](images/transformer.png)
![Transformer](images/transformer_layers.png)
"""
import tensorflow as tf
from typing import Callable, Tuple, Optional
class FFN(tf.keras.layers.Layer):
"""Feed-forward network
Args:
dim (int): dimension of the input and output
expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
dropout (float, optional): dropout rate. Defaults to None.
"""
def __init__(self, dim: int, expansion: int, activation: Callable[[tf.Tensor], tf.Tensor], dropout: Optional[float] = None):
super().__init__()
self.dim, self.expansion, self.activation, self.dropout = dim, expansion, activation, dropout
self.wide_dense = tf.keras.layers.Dense(dim * expansion, activation=activation)
self.dense = tf.keras.layers.Dense(dim, activation=None)
        # `Dropout(rate=None)` is not valid; treat a missing rate as no dropout.
        self.layer_dropout = tf.keras.layers.Dropout(dropout if dropout is not None else 0.0)
def get_config(self):
config = super(FFN, self).get_config()
config.update({"dim": self.dim, "expansion": self.expansion,
"activation": self.activation, "dropout": self.dropout})
return config
def call(self, inputs: tf.Tensor) -> tf.Tensor:
"""Forward pass of the feed-forward network
Args:
inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
Returns:
tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
"""
        output = self.wide_dense(inputs)   # expand to `dim * expansion`
        output = self.dense(output)        # project back to `dim`
        output = self.layer_dropout(output)
        return output
class MultiheadSelfAttention(tf.keras.layers.Layer):
"""Multi-head self-attention layer
This layer is a wrapper around the `tf.keras.layers.MultiHeadAttention` layer,
to fix the key, value, and query to be the same.
Args:
dim (int): dimension of the input and output
heads (int): number of heads
"""
def __init__(self, dim: int, heads: int):
super().__init__()
self.dim, self.heads = dim, heads
self.mha = tf.keras.layers.MultiHeadAttention(key_dim=dim // heads, num_heads=heads)
def get_config(self):
config = super(MultiheadSelfAttention, self).get_config()
config.update({"dim": self.dim, "heads": self.heads})
return config
def call(self, inputs: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
"""Forward pass of the multi-head self-attention layer
Args:
inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles, num_particles)`
This mask is used to mask out the attention of padding particles, generated when
tf.RaggedTensor is converted to tf.Tensor.
Returns:
tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
"""
output = self.mha(query=inputs, value=inputs, key=inputs, attention_mask=mask)
return output
class SelfAttentionBlock(tf.keras.layers.Layer):
"""Self-attention block.
It contains a multi-head self-attention layer and a feed-forward network with residual connections
and layer normalizations.
Args:
dim (int): dimension of the input and output
heads (int): number of heads
expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
dropout (float, optional): dropout rate. Defaults to None.
"""
def __init__(self, dim: int, heads: int, expansion: int, activation: Callable[[tf.Tensor], tf.Tensor], dropout: Optional[float] = None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.dim, self.heads, self.dropout = dim, heads, dropout
self.expansion, self.activation = expansion, activation
self.mhsa_ln = tf.keras.layers.LayerNormalization()
self.mhsa = MultiheadSelfAttention(dim, heads)
        # `Dropout(rate=None)` is not valid; treat a missing rate as no dropout.
        self.mhsa_dropout = tf.keras.layers.Dropout(dropout if dropout is not None else 0.0)
self.ffn_ln = tf.keras.layers.LayerNormalization()
self.ffn = FFN(dim, expansion, activation, dropout)
def call(self, inputs: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
"""Forward pass of the self-attention block
Args:
inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles, num_particles)`
This mask is used to mask out the attention of padding particles, generated when
tf.RaggedTensor is converted to tf.Tensor.
Returns:
tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
"""
        # Pre-LN attention sub-layer with a residual connection.
        attended = self.mhsa_ln(inputs)
        attended = self.mhsa(attended, mask)
        attended = self.mhsa_dropout(attended)
        attended = attended + inputs
        # Pre-LN feed-forward sub-layer with a residual connection.
        ffn_output = self.ffn_ln(attended)
        ffn_output = self.ffn(ffn_output)
        output = ffn_output + attended
        return output
class Transformer(tf.keras.layers.Layer):
"""Pure Transformer layers without embedding and output layers.
It also creates the class token, which is used to encode the global information of the input,
by concatenating the class token to the input.
Args:
layers (int): number of Self-Attention layers
dim (int): dimension of the input and output
expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
heads (int): number of heads
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
dropout (float, optional): dropout rate. Defaults to None.
"""
def __init__(self, layers: int, dim: int, expansion: int, heads: int, activation: Callable[[tf.Tensor], tf.Tensor], dropout: Optional[float] = None):
# Make sure `dim` is even.
assert dim % 2 == 0
super().__init__()
self.layers, self.dim, self.expansion, self.heads, self.dropout, self.activation = layers, dim, expansion, heads, dropout, activation
self.class_token = tf.Variable(initial_value=tf.random.truncated_normal(
(1, 1, dim), stddev=0.02), trainable=True)
self.sa_layers = [SelfAttentionBlock(dim, heads, expansion, activation, dropout) for _ in range(layers)]
def get_config(self):
config = super(Transformer, self).get_config()
config.update({name: getattr(self, name)
for name in ["layers", "dim", "expansion", "heads", "dropout", "activation"]})
return config
def call(self, inputs: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
"""Forward pass of the Transformer layers
Args:
inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
            mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles + 1)`, covering
                the prepended class token as well as the particles. From this mask, a mask tensor of shape
                `(batch_size, num_particles + 1, num_particles + 1)` is calculated, which is used to mask out
                the attention of padding particles, generated when a `tf.RaggedTensor` is converted to a `tf.Tensor`.
        Returns:
            tf.Tensor: output tensor of shape `(batch_size, num_particles + 1, dim)`, with the class token at position 0
"""
        # Expand the per-particle mask into a pairwise attention mask.
        mask = mask[:, tf.newaxis, :] & mask[:, :, tf.newaxis]
        # Prepend the learnable class token to every sequence in the batch.
        class_tokens = tf.tile(self.class_token, [tf.shape(inputs)[0], 1, 1])
        hidden = tf.concat([class_tokens, inputs], axis=1)
for sa_block in self.sa_layers:
hidden = sa_block(hidden, mask)
return hidden
class FCEmbedding(tf.keras.layers.Layer):
"""Embedding layer as a series of fully-connected layers.
Args:
embed_dim (int): dimension of the embedding
embed_layers (int): number of fully-connected layers
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function
"""
def __init__(self, embed_dim: int, embed_layers: int, activation: Callable[[tf.Tensor], tf.Tensor]):
super().__init__()
        self.embedding_dim, self.activation, self.num_embedding_layers = embed_dim, activation, embed_layers
        self.layers = [tf.keras.layers.Dense(self.embedding_dim, activation=self.activation)
                       for _ in range(self.num_embedding_layers)]
def get_config(self):
config = super(FCEmbedding, self).get_config()
        config.update({name: getattr(self, name) for name in ["embedding_dim", "num_embedding_layers", "activation"]})
return config
def call(self, inputs):
"""Forward pass of the embedding layer
Args:
inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, num_features)`
Returns:
tf.Tensor: output tensor of shape `(batch_size, num_particles, embed_dim)`
"""
hidden = inputs
for layer in self.layers:
hidden = layer(hidden)
return hidden
class TransformerModel(tf.keras.Model):
"""Transformer model with embedding and output layers.
The model already contains the `tf.keras.layers.Input` layer, so it can be used as a standalone model.
The input tensor is first passed through the embedding layer, then the Transformer layers, and finally the output layer.
If the preprocessing layer is not None, the input tensor is first passed through the preprocessing layer before the embedding layer.
The input to the output layer is the extracted class token.
Args:
        input_shape (Tuple[None, int]): shape of the input
        embed_dim (int): dimension of the embedding
        embed_layers (int): number of fully-connected layers in the embedding
        self_attn_layers (int): number of Self-Attention layers
        expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `embed_dim * expansion`
        heads (int): number of heads
        dropout (float): dropout rate
        output_layer (tf.keras.layers.Layer): output layer
        activation (Callable[[tf.Tensor], tf.Tensor]): activation function used in all the layers
preprocess (tf.keras.layers.Layer, optional): preprocessing layer. Defaults to None.
"""
def __init__(self,
input_shape: Tuple[None, int],
embed_dim: int,
embed_layers: int,
self_attn_layers: int,
expansion: int,
heads: int,
dropout: float,
output_layer: tf.keras.layers.Layer,
activation: Callable[[tf.Tensor], tf.Tensor],
preprocess: Optional[tf.keras.layers.Layer] = None):
        inputs = tf.keras.layers.Input(shape=input_shape, ragged=True)
        # Per-jet particle counts, needed to build the padding mask.
        row_lengths = inputs.row_lengths()
        hidden = inputs.to_tensor()
        if preprocess is not None:
            hidden = preprocess(hidden)
        hidden = FCEmbedding(embed_dim, embed_layers, activation)(hidden)
        # Account for the class token prepended inside the Transformer layer.
        row_lengths += 1
        transformed = Transformer(self_attn_layers, embed_dim, expansion,
                                  heads, activation, dropout)(hidden, mask=tf.sequence_mask(row_lengths))
        # The class token (position 0) serves as the jet-level representation.
        transformed = tf.keras.layers.LayerNormalization()(transformed[:, 0, :])
        output = output_layer(transformed)
        super().__init__(inputs=inputs, outputs=output)
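The docstrings in this module repeatedly mention padding particles that appear when a `tf.RaggedTensor` is converted to a dense `tf.Tensor`. The following minimal sketch (the jet and feature counts are made-up illustrative values) shows that conversion and the pairwise attention mask derived from it:

import tensorflow as tf

# Hypothetical mini-batch: two jets with 3 and 4 particles, 4 features each.
ragged = tf.RaggedTensor.from_row_lengths(tf.random.normal((7, 4)), row_lengths=[3, 4])
dense = ragged.to_tensor()                      # (2, 4, 4); the first jet is zero-padded
valid = tf.sequence_mask(ragged.row_lengths())  # (2, 4); True marks real particles
pairwise = valid[:, tf.newaxis, :] & valid[:, :, tf.newaxis]  # (2, 4, 4) attention mask
print(dense.shape, pairwise.shape)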
Classes
class FCEmbedding (embed_dim: int, embed_layers: int, activation: Callable[[tf.Tensor], tf.Tensor])

Embedding layer as a series of fully-connected layers.

Args
    embed_dim (int): dimension of the embedding
    embed_layers (int): number of fully-connected layers
    activation (Callable[[tf.Tensor], tf.Tensor]): activation function
Ancestors
- keras.engine.base_layer.Layer
- tensorflow.python.module.module.Module
- tensorflow.python.trackable.autotrackable.AutoTrackable
- tensorflow.python.trackable.base.Trackable
- keras.utils.version_utils.LayerVersionSelector
Methods

def call(self, inputs)

Forward pass of the embedding layer.

Args
    inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, num_features)`

Returns
    tf.Tensor: output tensor of shape `(batch_size, num_particles, embed_dim)`
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by `Network` (one layer of abstraction above). Note that `get_config()` does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns
    Python dictionary.
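A minimal usage sketch of the shape contract above; the batch, particle, and feature counts are arbitrary illustrative values:

import tensorflow as tf
from jidenn.models.Transformer import FCEmbedding

embedding = FCEmbedding(embed_dim=128, embed_layers=2, activation=tf.nn.gelu)
particles = tf.random.normal((32, 20, 6))  # hypothetical: 32 jets, 20 particles, 6 features
print(embedding(particles).shape)          # (32, 20, 128)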
class FFN (dim: int, expansion: int, activation: Callable[[tf.Tensor], tf.Tensor], dropout: Optional[float] = None)

Feed-forward network.

Args
    dim (int): dimension of the input and output
    expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
    activation (Callable[[tf.Tensor], tf.Tensor]): activation function
    dropout (float, optional): dropout rate. Defaults to None.
Ancestors
- keras.engine.base_layer.Layer
- tensorflow.python.module.module.Module
- tensorflow.python.trackable.autotrackable.AutoTrackable
- tensorflow.python.trackable.base.Trackable
- keras.utils.version_utils.LayerVersionSelector
Methods

def call(self, inputs: tf.Tensor) -> tf.Tensor

Forward pass of the feed-forward network.

Args
    inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`

Returns
    tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by `Network` (one layer of abstraction above). Note that `get_config()` does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns
    Python dictionary.
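A short sketch of the layer's shape behaviour, with illustrative sizes; note that the hidden layer inside has width `dim * expansion`:

import tensorflow as tf
from jidenn.models.Transformer import FFN

ffn = FFN(dim=128, expansion=4, activation=tf.nn.gelu, dropout=0.1)
x = tf.random.normal((8, 20, 128))  # hypothetical batch of 8 jets with 20 particles
print(ffn(x).shape)                 # (8, 20, 128); hidden width is 128 * 4 = 512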
class MultiheadSelfAttention (dim: int, heads: int)
-
Multi-head self-attention layer This layer is a wrapper around the
tf.keras.layers.MultiHeadAttention
layer, to fix the key, value, and query to be the same.Args
dim
:int
- dimension of the input and output
heads
:int
- number of heads
Ancestors
- keras.engine.base_layer.Layer
- tensorflow.python.module.module.Module
- tensorflow.python.trackable.autotrackable.AutoTrackable
- tensorflow.python.trackable.base.Trackable
- keras.utils.version_utils.LayerVersionSelector
Methods

def call(self, inputs: tf.Tensor, mask: tf.Tensor) -> tf.Tensor

Forward pass of the multi-head self-attention layer.

Args
    inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
    mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles, num_particles)`. This mask is used to mask out the attention of padding particles, generated when a `tf.RaggedTensor` is converted to a `tf.Tensor`.

Returns
    tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by `Network` (one layer of abstraction above). Note that `get_config()` does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns
    Python dictionary.
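A minimal sketch of building the pairwise boolean mask described in `call` with `tf.sequence_mask`; the sequence lengths are made up:

import tensorflow as tf
from jidenn.models.Transformer import MultiheadSelfAttention

mhsa = MultiheadSelfAttention(dim=128, heads=8)  # each head uses key_dim = 128 // 8 = 16
x = tf.random.normal((2, 5, 128))
valid = tf.sequence_mask([3, 5], maxlen=5)                # (2, 5); True marks real particles
mask = valid[:, tf.newaxis, :] & valid[:, :, tf.newaxis]  # (2, 5, 5) pairwise mask
print(mhsa(x, mask).shape)                                # (2, 5, 128)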
class SelfAttentionBlock (dim: int, heads: int, expansion: int, activation: Callable[[tf.Tensor], tf.Tensor], dropout: Optional[float] = None, *args, **kwargs)

Self-attention block. It contains a multi-head self-attention layer and a feed-forward network with residual connections and layer normalizations.

Args
    dim (int): dimension of the input and output
    heads (int): number of heads
    expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
    activation (Callable[[tf.Tensor], tf.Tensor]): activation function
    dropout (float, optional): dropout rate. Defaults to None.
Ancestors
- keras.engine.base_layer.Layer
- tensorflow.python.module.module.Module
- tensorflow.python.trackable.autotrackable.AutoTrackable
- tensorflow.python.trackable.base.Trackable
- keras.utils.version_utils.LayerVersionSelector
Methods

def call(self, inputs: tf.Tensor, mask: tf.Tensor) -> tf.Tensor

Forward pass of the self-attention block.

Args
    inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
    mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles, num_particles)`. This mask is used to mask out the attention of padding particles, generated when a `tf.RaggedTensor` is converted to a `tf.Tensor`.

Returns
    tf.Tensor: output tensor of shape `(batch_size, num_particles, dim)`
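A usage sketch with illustrative sizes; because both sub-layers are wrapped in residual connections, the block preserves its input shape:

import tensorflow as tf
from jidenn.models.Transformer import SelfAttentionBlock

block = SelfAttentionBlock(dim=64, heads=4, expansion=2, activation=tf.nn.gelu, dropout=0.1)
x = tf.random.normal((4, 10, 64))
valid = tf.sequence_mask([10, 7, 4, 10], maxlen=10)
mask = valid[:, tf.newaxis, :] & valid[:, :, tf.newaxis]
print(block(x, mask).shape)  # (4, 10, 64)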
class Transformer (layers: int, dim: int, expansion: int, heads: int, activation: Callable[[tf.Tensor], tf.Tensor], dropout: Optional[float] = None)

Pure Transformer layers without embedding and output layers.

It also creates the class token, which is used to encode the global information of the input, by concatenating the class token to the input.

Args
    layers (int): number of Self-Attention layers
    dim (int): dimension of the input and output
    expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `dim * expansion`
    heads (int): number of heads
    activation (Callable[[tf.Tensor], tf.Tensor]): activation function
    dropout (float, optional): dropout rate. Defaults to None.
Ancestors
- keras.engine.base_layer.Layer
- tensorflow.python.module.module.Module
- tensorflow.python.trackable.autotrackable.AutoTrackable
- tensorflow.python.trackable.base.Trackable
- keras.utils.version_utils.LayerVersionSelector
Methods

def call(self, inputs: tf.Tensor, mask: tf.Tensor) -> tf.Tensor

Forward pass of the Transformer layers.

Args
    inputs (tf.Tensor): input tensor of shape `(batch_size, num_particles, dim)`
    mask (tf.Tensor): mask tensor of shape `(batch_size, num_particles + 1)`, covering the prepended class token as well as the particles. From this mask, a mask tensor of shape `(batch_size, num_particles + 1, num_particles + 1)` is calculated, which is used to mask out the attention of padding particles, generated when a `tf.RaggedTensor` is converted to a `tf.Tensor`.

Returns
    tf.Tensor: output tensor of shape `(batch_size, num_particles + 1, dim)`, with the class token at position 0
def get_config(self)

Returns the config of the layer.

A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration.

The config of a layer does not include connectivity information, nor the layer class name. These are handled by `Network` (one layer of abstraction above). Note that `get_config()` does not guarantee to return a fresh copy of dict every time it is called. The callers should make a copy of the returned dict if they want to modify it.

Returns
    Python dictionary.
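A sketch illustrating the class token, with made-up sizes: the mask passed to `call` must already cover the extra position, and the output is one position longer than the input:

import tensorflow as tf
from jidenn.models.Transformer import Transformer

encoder = Transformer(layers=3, dim=64, expansion=2, heads=4, activation=tf.nn.gelu, dropout=0.1)
x = tf.random.normal((2, 10, 64))
# The mask must cover the class token prepended inside `call`, hence length 10 + 1.
valid = tf.sequence_mask([11, 7], maxlen=11)
print(encoder(x, valid).shape)  # (2, 11, 64); the class token sits at position 0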
class TransformerModel (input_shape: Tuple[None, int], embed_dim: int, embed_layers: int, self_attn_layers: int, expansion: int, heads: int, dropout: float, output_layer: tf.keras.layers.Layer, activation: Callable[[tf.Tensor], tf.Tensor], preprocess: Optional[tf.keras.layers.Layer] = None)

Transformer model with embedding and output layers.

The model already contains the `tf.keras.layers.Input` layer, so it can be used as a standalone model. The input tensor is first passed through the embedding layer, then the Transformer layers, and finally the output layer. If the preprocessing layer is not None, the input tensor is first passed through the preprocessing layer before the embedding layer. The input to the output layer is the extracted class token.

Args
    input_shape (Tuple[None, int]): shape of the input
    embed_dim (int): dimension of the embedding
    embed_layers (int): number of fully-connected layers in the embedding
    self_attn_layers (int): number of Self-Attention layers
    expansion (int): expansion factor of the hidden layer, i.e. the hidden layer has size `embed_dim * expansion`
    heads (int): number of heads
    dropout (float): dropout rate
    output_layer (tf.keras.layers.Layer): output layer
    activation (Callable[[tf.Tensor], tf.Tensor]): activation function used in all the layers
    preprocess (tf.keras.layers.Layer, optional): preprocessing layer. Defaults to None.
Ancestors
- keras.engine.training.Model
- keras.engine.base_layer.Layer
- tensorflow.python.module.module.Module
- tensorflow.python.trackable.autotrackable.AutoTrackable
- tensorflow.python.trackable.base.Trackable
- keras.utils.version_utils.LayerVersionSelector
- keras.utils.version_utils.ModelVersionSelector
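An end-to-end usage sketch with made-up hyperparameters and a hypothetical binary-classification head; the ragged batch stands in for jets with varying particle multiplicities:

import tensorflow as tf
from jidenn.models.Transformer import TransformerModel

model = TransformerModel(
    input_shape=(None, 6),        # up to any number of particles, 6 features each
    embed_dim=64,                 # must be even (asserted in Transformer)
    embed_layers=2,
    self_attn_layers=3,
    expansion=2,
    heads=4,
    dropout=0.1,
    output_layer=tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
    activation=tf.nn.gelu,
)
# Three jets with 3, 5, and 4 particles, packed as a ragged batch.
jets = tf.RaggedTensor.from_row_lengths(tf.random.normal((12, 6)), row_lengths=[3, 5, 4])
print(model(jets).shape)  # (3, 1)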