CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/videos/qa_processing.ipynb
Views: 2542
Kernel: Unknown Kernel

This notebook gathers the code samples from the video below, which is part of the Hugging Face course.

#@title
from IPython.display import HTML

# Embed the course video that this notebook accompanies.
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/qgaM0weJHpA?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')

Install the Transformers and Datasets libraries to run this notebook.

# Notebook shell escape: install the libraries imported by the cells below.
! pip install datasets transformers[sentencepiece]
from datasets import load_dataset

# Load SQuAD and drop the columns that are not used below.
raw_datasets = load_dataset("squad")
raw_datasets = raw_datasets.remove_columns(["id", "title"])


def prepare_data(example):
    """Replace the nested ``answers`` field by flat character offsets.

    Only the first listed answer is used. ``answer_start`` is its starting
    character index and ``answer_end`` is that index plus the answer's length,
    so ``context[answer_start:answer_end]`` is the answer text.
    """
    answers = example["answers"]
    first_answer_text = answers["text"][0]
    first_answer_start = answers["answer_start"][0]

    example["answer_start"] = first_answer_start
    example["answer_end"] = first_answer_start + len(first_answer_text)
    return example


# Apply to every split; the original "answers" column is dropped afterwards.
raw_datasets = raw_datasets.map(prepare_data, remove_columns=["answers"])
raw_datasets["train"]
# Display the first training example and recover its answer by slicing the
# context with the stored character offsets.
first_example = raw_datasets["train"][0]

print(f"Context: {first_example['context']}")
print(f"Question: {first_example['question']}")

start = first_example["answer_start"]
end = first_example["answer_end"]
print(f"\nAnswer: {first_example['context'][start:end]}")
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize a single (question, context) pair. Overflowing tokens and the
# per-token character offsets are requested because the labelling code below
# reads "offset_mapping" and sequence_ids() from the result.
example = raw_datasets["train"][0]
inputs = tokenizer(
    example["question"],
    example["context"],
    truncation="only_second",
    padding="max_length",
    max_length=384,
    stride=128,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
def find_labels(offsets, answer_start, answer_end, sequence_ids):
    """Return the token span that covers the answer inside one feature.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs, indexed by token
            position.
        answer_start: Character index at which the answer starts.
        answer_end: Character index just past the end of the answer.
        sequence_ids: Per-token sequence id; the tokens whose id is ``1`` are
            treated as the context.

    Returns:
        ``(start_position, end_position)`` as inclusive token indices, or
        ``(0, 0)`` when the answer is not fully contained in this feature's
        context tokens (including when there are no context tokens at all).
    """
    n_tokens = len(sequence_ids)

    # Find the first and last token of the context (sequence id 1). The bounds
    # checks keep the scans from running off the end when the context is
    # missing or is not followed by another token.
    idx = 0
    while idx < n_tokens and sequence_ids[idx] != 1:
        idx += 1
    if idx == n_tokens:
        return (0, 0)
    context_start = idx
    while idx < n_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully in the context, return (0, 0). Both ends of
    # the answer must lie inside the context's character range; an answer
    # that is only partially present would otherwise be labelled with a token
    # outside the context.
    if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
        return (0, 0)

    # Start label: the last context token that begins at or before the answer.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= answer_start:
        idx += 1
    start_position = idx - 1

    # End label: the first context token that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= answer_end:
        idx -= 1
    end_position = idx + 1

    return start_position, end_position
# Sanity check on the first feature: compute its labels, then decode the
# labelled token span back to text.
first_offsets = inputs["offset_mapping"][0]
first_sequence_ids = inputs.sequence_ids(0)

start, end = find_labels(
    first_offsets,
    example["answer_start"],
    example["answer_end"],
    first_sequence_ids,
)
tokenizer.decode(inputs["input_ids"][0][start: end+1])
def preprocess_training_examples(examples):
    """Tokenize a batch of question/context pairs and attach answer labels.

    Args:
        examples: A batch with ``"question"``, ``"context"``,
            ``"answer_start"`` and ``"answer_end"`` columns, each a list with
            one entry per example.

    Returns:
        The tokenizer output with ``"offset_mapping"`` and
        ``"overflow_to_sample_mapping"`` removed and two lists added,
        ``"start_positions"`` and ``"end_positions"``, holding the labels
        computed by ``find_labels`` for each produced feature.
    """
    # Tokenize the whitespace-stripped questions rather than the raw strings.
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    # Maps each produced feature back to the index of the example it came
    # from, so the right answer offsets are looked up for every feature.
    sample_map = inputs.pop("overflow_to_sample_mapping")

    inputs["start_positions"] = []
    inputs["end_positions"] = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        start, end = find_labels(
            offset,
            examples["answer_start"][sample_idx],
            examples["answer_end"][sample_idx],
            inputs.sequence_ids(i),
        )
        inputs["start_positions"].append(start)
        inputs["end_positions"].append(end)

    return inputs
# Preprocess every split in batches. The raw columns are removed, leaving
# only what preprocess_training_examples returns.
columns_to_drop = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=columns_to_drop,
)