CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/zh-CN/chapter3/section3_tf.ipynb
Views: 2548
Kernel: Unknown Kernel

使用 Trainer API 或者 Keras 微调一个模型

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

!pip install datasets evaluate transformers[sentencepiece]
from datasets import load_dataset from transformers import AutoTokenizer, DataCollatorWithPadding import numpy as np raw_datasets = load_dataset("glue", "mrpc") checkpoint = "bert-base-uncased" tokenizer = AutoTokenizer.from_pretrained(checkpoint) def tokenize_function(example): return tokenizer(example["sentence1"], example["sentence2"], truncation=True) tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") tf_train_dataset = tokenized_datasets["train"].to_tf_dataset( columns=["attention_mask", "input_ids", "token_type_ids"], label_cols=["labels"], shuffle=True, collate_fn=data_collator, batch_size=8, ) tf_validation_dataset = tokenized_datasets["validation"].to_tf_dataset( columns=["attention_mask", "input_ids", "token_type_ids"], label_cols=["labels"], shuffle=False, collate_fn=data_collator, batch_size=8, )
from transformers import TFAutoModelForSequenceClassification model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
from tensorflow.keras.losses import SparseCategoricalCrossentropy model.compile( optimizer="adam", loss=SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"], ) model.fit( tf_train_dataset, validation_data=tf_validation_dataset, )
from tensorflow.keras.optimizers.schedules import PolynomialDecay batch_size = 8 num_epochs = 3 # 训练步数是数据集中的样本数除以batch size再乘以 epoch。 # 注意这里的tf_train_dataset是一个转化为batch后的 tf.data.Dataset, # 不是原来的 Hugging Face Dataset,所以它的 len() 已经是 num_samples // batch_size。 num_train_steps = len(tf_train_dataset) * num_epochs lr_scheduler = PolynomialDecay( initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps ) from tensorflow.keras.optimizers import Adam opt = Adam(learning_rate=lr_scheduler)
import tensorflow as tf model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)
preds = model.predict(tf_validation_dataset)["logits"]
class_preds = np.argmax(preds, axis=1) print(preds.shape, class_preds.shape)
(408, 2) (408,)
import evaluate metric = evaluate.load("glue", "mrpc") metric.compute(predictions=class_preds, references=raw_datasets["validation"]["label"])
{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}