Debugging the training pipeline

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

!pip install datasets evaluate transformers[sentencepiece]
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

raw_datasets = load_dataset("glue", "mnli")

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

args = TrainingArguments(
    f"distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model,
    args,
    train_dataset=raw_datasets["train"],
    eval_dataset=raw_datasets["validation_matched"],
    compute_metrics=compute_metrics,
)
trainer.train()
'ValueError: You have to specify either input_ids or inputs_embeds'

The model never received any input_ids, which suggests the data fed to the Trainer is not tokenized. Let's look at the first training example:
trainer.train_dataset[0]
{'hypothesis': 'Product and geography are what make cream skimming work. ', 'idx': 0, 'label': 1, 'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.'}

Indeed, the example only contains the raw text: we passed raw_datasets to the Trainer instead of tokenized_datasets. Let's fix that:
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

raw_datasets = load_dataset("glue", "mnli")

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

args = TrainingArguments(
    f"distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
)
trainer.train()
'ValueError: expected sequence of length 43 at dim 1 (got 37)'

The inputs now reach the model, but something fails when sequences of different lengths are put together. Before looking at batching, let's double-check that the tokenized data itself looks right:
tokenizer.decode(trainer.train_dataset[0]["input_ids"])
'[CLS] conceptually cream skimming has two basic dimensions - product and geography. [SEP] product and geography are what make cream skimming work. [SEP]'
trainer.train_dataset[0].keys()
dict_keys(['attention_mask', 'hypothesis', 'idx', 'input_ids', 'label', 'premise'])
type(trainer.model)
transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification
trainer.train_dataset[0]["attention_mask"]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
len(trainer.train_dataset[0]["attention_mask"]) == len(
    trainer.train_dataset[0]["input_ids"]
)
True
trainer.train_dataset[0]["label"]
1
trainer.train_dataset.features["label"].names
['entailment', 'neutral', 'contradiction']
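
With the label names in hand, one extra check you can run (a minimal sketch of our own, not part of the original notebook) is that every label id in the training set is a valid index into that list:

label_names = trainer.train_dataset.features["label"].names
train_labels = trainer.train_dataset["label"]
# Every label id should be a valid index into label_names (0, 1 or 2 for MNLI).
assert all(0 <= label < len(label_names) for label in train_labels)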
for batch in trainer.get_train_dataloader():
    break
~/git/transformers/src/transformers/data/data_collator.py in torch_default_data_collator(features)
    105             batch[k] = torch.stack([f[k] for f in features])
    106         else:
--> 107             batch[k] = torch.tensor([f[k] for f in features])
    108
    109     return batch

ValueError: expected sequence of length 45 at dim 1 (got 76)

Batching fails inside the data collator: torch.tensor can only stack sequences that already have the same length, so something needs to pad them first. Let's see which collator the Trainer is actually using:
data_collator = trainer.get_train_dataloader().collate_fn
data_collator
<function transformers.data.data_collator.default_data_collator(features: List[InputDataClass], return_tensors='pt') -> Dict[str, Any]>
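
The Trainer fell back to default_data_collator, which does no padding, because we passed it neither a data collator nor a tokenizer. Before rebuilding the Trainer, here is a minimal sketch (the filtering of the raw text columns is our own addition, not from the notebook) showing that DataCollatorWithPadding turns examples of different lengths into one rectangular batch:

from transformers import DataCollatorWithPadding

padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Keep only the tensorizable columns; the raw text columns would break the collator.
features = [
    {k: v for k, v in tokenized_datasets["train"][i].items()
     if k in ("input_ids", "attention_mask", "label")}
    for i in range(4)
]
padded = padding_collator(features)
print(padded["input_ids"].shape)  # all four examples now share the same length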
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

raw_datasets = load_dataset("glue", "mnli")

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

args = TrainingArguments(
    f"distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric.compute(predictions=predictions, references=labels)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`

A CUDA error like this one is hard to interpret: it is usually raised asynchronously, far away from the operation that actually failed. The most reliable way to debug it is to rerun the same steps on the CPU, where we get a regular Python traceback. First, let's check that batching now works by calling the data collator by hand:
data_collator = trainer.get_train_dataloader().collate_fn
batch = data_collator([trainer.train_dataset[i] for i in range(4)])
This direct call fails because the data collator also receives the raw text columns (premise and hypothesis) that the Trainer normally strips out before batching. We can reproduce what the Trainer does with its _remove_unused_columns method:

data_collator = trainer.get_train_dataloader().collate_fn
actual_train_set = trainer._remove_unused_columns(trainer.train_dataset)
batch = data_collator([actual_train_set[i] for i in range(4)])
for batch in trainer.get_train_dataloader():
    break
outputs = trainer.model.cpu()(**batch)
~/.pyenv/versions/3.7.9/envs/base/lib/python3.7/site-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   2386     )
   2387     if dim == 2:
-> 2388         ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
   2389     elif dim == 4:
   2390         ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)

IndexError: Target 2 is out of bounds.

On the CPU we get a readable traceback: the loss computation receives a target of 2 that is out of bounds, which means the model's classification head has fewer classes than the dataset has labels. Let's check how many labels the model was configured with:
trainer.model.config.num_labels
2
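
So the model head was built for two classes while MNLI has three, which is exactly why a label of 2 was out of bounds. A small sketch (our own, not from the original notebook) of a check you can run before training to catch this mismatch early:

num_dataset_labels = len(tokenized_datasets["train"].features["label"].names)
num_model_labels = trainer.model.config.num_labels
# The model's classification head must have one output per dataset class.
assert num_dataset_labels == num_model_labels, (
    f"Dataset has {num_dataset_labels} classes but the model head has {num_model_labels}"
)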
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

raw_datasets = load_dataset("glue", "mnli")

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

args = TrainingArguments(
    f"distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric.compute(predictions=predictions, references=labels)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
for batch in trainer.get_train_dataloader():
    break

outputs = trainer.model.cpu()(**batch)
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: v.to(device) for k, v in batch.items()}

outputs = trainer.model.to(device)(**batch)
loss = outputs.loss
loss.backward()
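
A related sanity check you can add at this point (a sketch of our own, not in the original notebook) is to confirm that the backward pass actually produced a gradient for every trainable parameter:

# After loss.backward(), every trainable parameter should have a gradient.
for name, param in trainer.model.named_parameters():
    if param.requires_grad and param.grad is None:
        print(f"No gradient for {name}")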
trainer.create_optimizer()
trainer.optimizer.step()
# This will take a long time and error out, so you shouldn't run this cell
trainer.train()
TypeError: only size-1 arrays can be converted to Python scalars

The failure happens during the evaluation phase at the end of the first epoch, so we can reproduce it much faster with trainer.evaluate():
trainer.evaluate()
TypeError: only size-1 arrays can be converted to Python scalars

Since the error only shows up during evaluation, the problem is most likely in compute_metrics. To confirm, let's grab a batch of evaluation data, run the model on it manually, and call compute_metrics on the result:
for batch in trainer.get_eval_dataloader():
    break
batch = {k: v.to(device) for k, v in batch.items()}

with torch.no_grad():
    outputs = trainer.model(**batch)
predictions = outputs.logits.cpu().numpy()
labels = batch["labels"].cpu().numpy()

compute_metrics((predictions, labels))
TypeError: only size-1 arrays can be converted to Python scalars
predictions.shape, labels.shape
((8, 3), (8,))

compute_metrics receives the raw logits of shape (batch_size, num_labels), but the metric expects a single predicted class per example, so we need to take the argmax of the logits first:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


compute_metrics((predictions, labels))
{'accuracy': 0.625}
import numpy as np
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

raw_datasets = load_dataset("glue", "mnli")

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

args = TrainingArguments(
    f"distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
Once training and evaluation run without errors, a good last sanity check is to verify the model can overfit a single batch:

for batch in trainer.get_train_dataloader():
    break
batch = {k: v.to(device) for k, v in batch.items()}

trainer.create_optimizer()

for _ in range(20):
    outputs = trainer.model(**batch)
    loss = outputs.loss
    loss.backward()
    trainer.optimizer.step()
    trainer.optimizer.zero_grad()
with torch.no_grad():
    outputs = trainer.model(**batch)
preds = outputs.logits
labels = batch["labels"]

compute_metrics((preds.cpu().numpy(), labels.cpu().numpy()))
{'accuracy': 1.0}

The model reaches 100% accuracy on the batch it was trained to overfit, which is a good sign that the model and the data pipeline are wired up correctly.