This method is only called in ``__getitem__``, chunked out separately
for readability.
Parameters
----------
sequences : List[List[int]]
List of tokenized sequences, each sequence is typically a
List[int].
Returns
-------
torch.Tensor, List[int]
Tensor of sequences padded to ``max_sequence_length``, and a list of
the original sequence lengths before padding.
"""
for i in range(len(sequences)):
sequences[i] = sequences[i][
: self.config["max_sequence_length"] - 1
]
sequence_lengths = [len(sequence) for sequence in sequences]
# Pad all sequences to max_sequence_length.
maxpadded_sequences = torch.full(
(len(sequences), self.config["max_sequence_length"]),
fill_value=self.vocabulary.PAD_INDEX,
)
padded_sequences = pad_sequence(
[torch.tensor(sequence) for sequence in sequences],
batch_first=True,
padding_value=self.vocabulary.PAD_INDEX,
)
maxpadded_sequences[:, : padded_sequences.size(1)] = padded_sequences
return maxpadded_sequences, sequence_lengths