My Colaboratory Notebook
I've done everything the video showed except the first two cells in the "Get Data from Kaggle" section: instead of Kaggle, I used a JSON file of Fullmetal Alchemist quotes that I scraped from Wikiquote. I also made sure not to include any empty values (see the sketch below for roughly how I cleaned the data).
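The cleanup step looked roughly like this (the file name and column handling are illustrative, not the exact code from my notebook):

```python
import pandas as pd

# Load the scraped Wikiquote JSON (file name is a placeholder).
df = pd.read_json("fma_quotes.json")

# Drop rows with missing values, then drop rows where any cell
# is an empty or whitespace-only string.
df = df.dropna()
df = df[df.apply(lambda row: all(str(x).strip() for x in row), axis=1)]
```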
But when I run the main function cell, it throws a TypeError.
Here's the stack trace:
09/22/2021 23:21:33 - WARNING - __main__ - Process rank: -1, device: cuda, n_gpu: 1, distributed training: False, 16-bits training: False
/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:592: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.
FutureWarning,
09/22/2021 23:21:41 - INFO - __main__ - Training/evaluation parameters <__main__.Args object at 0x7f0379708d10>
09/22/2021 23:21:41 - INFO - __main__ - Creating features from dataset file at cached
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-50-523c0d2a27d3> in <module>()
----> 1 main(trn_df, val_df)
<ipython-input-49-aa20b6fc78bc> in main(df_trn, df_val)
61 # Training
62 if args.do_train:
---> 63 train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)
64
65 global_step, tr_loss = train(args, train_dataset, model, tokenizer)
<ipython-input-40-67f62bb60333> in load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate)
2
3 def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
----> 4 return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)
5
6
<ipython-input-39-a654172287f5> in __init__(self, tokenizer, args, df, block_size)
25 self.examples = []
26 for _, row in df.iterrows():
---> 27 conv = construct_conv(row, tokenizer)
28 self.examples.append(conv)
29
<ipython-input-39-a654172287f5> in construct_conv(row, tokenizer, eos)
2 def construct_conv(row, tokenizer, eos = True):
3 flatten = lambda l: [item for sublist in l for item in sublist]
----> 4 conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
5 conv = flatten(conv)
6 return conv
<ipython-input-39-a654172287f5> in <listcomp>(.0)
2 def construct_conv(row, tokenizer, eos = True):
3 flatten = lambda l: [item for sublist in l for item in sublist]
----> 4 conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
5 conv = flatten(conv)
6 return conv
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in encode(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, return_tensors, **kwargs)
2160 stride=stride,
2161 return_tensors=return_tensors,
-> 2162 **kwargs,
2163 )
2164
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in encode_plus(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2488 return_length=return_length,
2489 verbose=verbose,
-> 2490 **kwargs,
2491 )
2492
/usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/tokenization_gpt2_fast.py in _encode_plus(self, *args, **kwargs)
171 )
172
--> 173 return super()._encode_plus(*args, **kwargs)
174
175 def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in _encode_plus(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
493 return_length=return_length,
494 verbose=verbose,
--> 495 **kwargs,
496 )
497
/usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/tokenization_gpt2_fast.py in _batch_encode_plus(self, *args, **kwargs)
161 )
162
--> 163 return super()._batch_encode_plus(*args, **kwargs)
164
165 def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in _batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose)
406 batch_text_or_text_pairs,
407 add_special_tokens=add_special_tokens,
--> 408 is_pretokenized=is_split_into_words,
409 )
410
TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
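The error message itself suggests the tokenizer is being handed something that isn't a plain string (the fast GPT-2 tokenizer only accepts text input). A sanity check I could run before calling main (a hypothetical snippet; trn_df is the frame passed to main above):

```python
# Report any cell in the training frame that is not a non-empty
# string, since tokenizer.encode() only accepts text input.
# Assumes trn_df is the DataFrame passed to main().
for idx, row in trn_df.iterrows():
    for col, value in row.items():
        if not isinstance(value, str) or not value.strip():
            print(f"Bad value at row {idx}, column {col!r}: {value!r}")
```

If that prints nothing for both trn_df and val_df, the bad input must be coming from somewhere else.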