
GitHub Repository: huggingface/notebooks
Path: blob/main/course/videos/mlm_processing.ipynb

This notebook contains the code samples from the video below, which is part of the Hugging Face course.

#@title
from IPython.display import HTML

HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/8PmhEIXhBvI?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')

Install the Transformers and Datasets libraries to run this notebook.

! pip install datasets transformers[sentencepiece]
from datasets import load_dataset

raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
raw_datasets["train"]
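Before tokenizing, it can help to look at a few raw rows; WikiText-2 contains many empty lines and section headings. A quick sanity check (which rows you print is arbitrary):

# Print the first few training rows; repr() makes empty strings and
# newline characters visible.
for i in range(3):
    print(repr(raw_datasets["train"][i]["text"]))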
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("imdb")
# Drop the classification labels; only the raw text is needed for MLM.
raw_datasets = raw_datasets.remove_columns("label")

model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
context_length = 128


def tokenize_pad_and_truncate(texts):
    # Pad short reviews and truncate long ones to a fixed context_length.
    return tokenizer(
        texts["text"], truncation=True, padding="max_length", max_length=context_length
    )


tokenized_datasets = raw_datasets.map(tokenize_pad_and_truncate, batched=True)
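As a quick check that padding and truncation behaved as expected, every tokenized example should now be exactly context_length tokens long. A minimal sketch:

# With padding="max_length" and truncation=True, every sequence is
# exactly context_length tokens.
assert all(
    len(ids) == context_length
    for ids in tokenized_datasets["train"][:10]["input_ids"]
)
print(len(tokenized_datasets["train"][0]["input_ids"]))  # 128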
def tokenize_and_chunk(texts):
    # Instead of discarding the truncated text, return the overflow as
    # additional chunks of at most context_length tokens each.
    return tokenizer(
        texts["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
    )


# Chunking changes the number of rows, so the old "text" column must be
# removed for the map to succeed.
tokenized_datasets = raw_datasets.map(
    tokenize_and_chunk, batched=True, remove_columns=["text"]
)
len(raw_datasets["train"]), len(tokenized_datasets["train"])
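With return_overflowing_tokens=True, a fast tokenizer also returns an overflow_to_sample_mapping that records which original review each chunk came from, which is why the chunked dataset has more rows than the raw one. A small sketch of how to inspect it:

# Tokenize two reviews directly and inspect the chunk-to-review mapping.
encodings = tokenizer(
    raw_datasets["train"][:2]["text"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
)
# e.g. [0, 0, 0, 1, 1] would mean review 0 produced three chunks and
# review 1 produced two.
print(encodings["overflow_to_sample_mapping"])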
def tokenize_and_chunk(texts):
    # Concatenate all tokenized texts into one stream, separated by a
    # special token. DistilBERT has no EOS token, so fall back to [SEP].
    sep_id = (
        tokenizer.eos_token_id
        if tokenizer.eos_token_id is not None
        else tokenizer.sep_token_id
    )
    all_input_ids = []
    for input_ids in tokenizer(texts["text"])["input_ids"]:
        all_input_ids.extend(input_ids)
        all_input_ids.append(sep_id)
    # Slice the stream into chunks of exactly context_length tokens
    # (the last chunk of each batch may be shorter).
    chunks = []
    for idx in range(0, len(all_input_ids), context_length):
        chunks.append(all_input_ids[idx : idx + context_length])
    return {"input_ids": chunks}


tokenized_datasets = raw_datasets.map(
    tokenize_and_chunk, batched=True, remove_columns=["text"]
)
len(raw_datasets["train"]), len(tokenized_datasets["train"])
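Since the concatenated token stream rarely divides evenly into context_length pieces, the last chunk of each mapped batch comes out shorter and will be padded later by the data collator. A quick way to see the length distribution (illustrative only):

from collections import Counter

# Count chunk lengths over a subset; most are exactly context_length.
lengths = Counter(
    len(ids) for ids in tokenized_datasets["train"][:1000]["input_ids"]
)
print(lengths.most_common(5))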
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)
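To see the random masking in action, you can run the collator on a couple of tokenized examples and decode the result (assuming PyTorch is installed, since the collator returns PyTorch tensors by default). Roughly 15% of tokens are selected, most of them replaced by [MASK]; the labels are -100 everywhere except at the selected positions, where they keep the original token ids:

# Collate two examples; the collator pads them and applies random masking.
samples = [tokenized_datasets["train"][i] for i in range(2)]
batch = data_collator(samples)

print(tokenizer.decode(batch["input_ids"][0]))  # shows [MASK] tokens
# -100 labels are ignored by the loss; the other positions hold the
# original ids of the masked tokens.
print(batch["labels"][0])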