CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/vi/chapter7/section4_tf.ipynb
Views: 2548
Kernel: Unknown Kernel

Dịch máy (TensorFlow)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

# Each shell command needs its own line: IPython only treats a leading `!` as a shell escape.
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

You will need to set up git and adapt your email and name in the following cell.

!git config --global user.email "[email protected]" !git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from within the notebook.
notebook_login()

from datasets import load_dataset

# Load the English/French language pair of the "kde4" dataset.
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

# Display the dataset structure.
raw_datasets
DatasetDict({ train: Dataset({ features: ['id', 'translation'], num_rows: 210173 }) })
# Keep 90% of the "train" split for training and hold out the rest; a fixed seed is passed to the split.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets
DatasetDict({ train: Dataset({ features: ['id', 'translation'], num_rows: 189155 }) test: Dataset({ features: ['id', 'translation'], num_rows: 21018 }) })
# Rename the held-out "test" split to "validation".
split_datasets["validation"] = split_datasets.pop("test")

# Look at one English/French pair from the training split.
split_datasets["train"][1]["translation"]
{'en': 'Default to expanded threads', 'fr': 'Par défaut, développer les fils de discussion'}
from transformers import pipeline

# Load the Helsinki-NLP/opus-mt-en-fr checkpoint in a translation pipeline
# and try it on the English sentence of the sample shown above.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")
[{'translation_text': 'Par défaut pour les threads élargis'}]
# Look at another English/French pair from the training split.
split_datasets["train"][172]["translation"]
{'en': 'Unable to import %1 using the OFX importer plugin. This file is not the correct format.', 'fr': "Impossible d'importer %1 en utilisant le module d'extension d'importation OFX. Ce fichier n'a pas un format correct."}
# Translate the English side of that pair with the pretrained pipeline.
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)
[{'translation_text': "Impossible d'importer %1 en utilisant le plugin d'importateur OFX. Ce fichier n'est pas le bon format."}]
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# NOTE(review): return_tensors is normally an argument of the tokenizer *call*,
# not of from_pretrained; the label ids printed later in this notebook are plain
# Python lists, so it looks like it has no effect here — confirm before relying on it.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="tf")
# Take the English and French sides of one training pair.
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]

inputs = tokenizer(en_sentence)
# `text_target` tokenizes the text in target-language mode; it replaces the
# deprecated `tokenizer.as_target_tokenizer()` context manager.
targets = tokenizer(text_target=fr_sentence)

# For comparison, tokenize the French sentence as if it were source-language
# text, then print both token sequences.
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))
['▁Par', '▁dé', 'f', 'aut', ',', '▁dé', 've', 'lop', 'per', '▁les', '▁fil', 's', '▁de', '▁discussion', '</s>'] ['▁Par', '▁défaut', ',', '▁développer', '▁les', '▁fils', '▁de', '▁discussion', '</s>']
max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of English/French pairs for sequence-to-sequence training.

    Args:
        examples: A batch from the dataset; ``examples["translation"]`` is a
            list of dicts with ``"en"`` and ``"fr"`` keys.

    Returns:
        The tokenizer output for the English sentences (truncated to
        ``max_input_length``), with the token ids of the French sentences
        (truncated to ``max_target_length``) stored under ``"labels"``.
    """
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the labels in target-language mode. `text_target` replaces the
    # deprecated `as_target_tokenizer()` context manager; a separate call keeps
    # the target-specific maximum length.
    labels = tokenizer(
        text_target=targets, max_length=max_target_length, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# Apply the preprocessing function to every split in batches, dropping the
# original columns.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

from transformers import TFAutoModelForSeq2SeqLM

# from_pt=True asks for the weights to be loaded from a PyTorch checkpoint.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)

from transformers import DataCollatorForSeq2Seq

# The collator receives the tokenizer and the model and returns TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

# Collate training examples 1 and 2 and list the keys of the resulting batch.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()
dict_keys(['attention_mask', 'input_ids', 'labels', 'decoder_input_ids'])
# Inspect the collated labels of the sample batch.
batch["labels"]
tensor([[ 577, 5891, 2, 3184, 16, 2542, 5, 1710, 0, -100, -100, -100, -100, -100, -100, -100], [ 1211, 3, 49, 9409, 1211, 3, 29140, 817, 3124, 817, 550, 7032, 5821, 7907, 12649, 0]])
# Inspect the decoder input ids the collator added to the sample batch.
batch["decoder_input_ids"]
tensor([[59513, 577, 5891, 2, 3184, 16, 2542, 5, 1710, 0, 59513, 59513, 59513, 59513, 59513, 59513], [59513, 1211, 3, 49, 9409, 1211, 3, 29140, 817, 3124, 817, 550, 7032, 5821, 7907, 12649]])
# Print the stored (un-collated) label ids of training examples 1 and 2.
for idx in (1, 2):
    example = tokenized_datasets["train"][idx]
    print(example["labels"])
[577, 5891, 2, 3184, 16, 2542, 5, 1710, 0] [1211, 3, 49, 9409, 1211, 3, 29140, 817, 3124, 817, 550, 7032, 5821, 7907, 12649, 0]
# Build batched tf.data datasets that use the data collator to assemble each
# batch. Only the training set is shuffled.
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)
tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)
!pip install sacrebleu

import evaluate

# Load the "sacrebleu" metric used for scoring translations below.
metric = evaluate.load("sacrebleu")
# Score one prediction against one reference. `references` holds a list of
# acceptable references per prediction, hence the nested list.
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)
{'score': 46.750469682990165, 'counts': [11, 6, 4, 3], 'totals': [12, 11, 10, 9], 'precisions': [91.67, 54.54, 40.0, 33.33], 'bp': 0.9200444146293233, 'sys_len': 12, 'ref_len': 13}
# Score a prediction that just repeats a single word.
predictions = ["This This This This"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)
{'score': 1.683602693167689, 'counts': [1, 0, 0, 0], 'totals': [4, 3, 2, 1], 'precisions': [25.0, 16.67, 12.5, 12.5], 'bp': 0.10539922456186433, 'sys_len': 4, 'ref_len': 13}
# Score a prediction that is much shorter than the reference.
predictions = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)
{'score': 0.0, 'counts': [2, 1, 0, 0], 'totals': [2, 1, 0, 0], 'precisions': [100.0, 100.0, 0.0, 0.0], 'bp': 0.004086771438464067, 'sys_len': 2, 'ref_len': 13}
import numpy as np


def compute_metrics():
    """Compute the metric score on a random 200-example validation sample.

    Shuffles the tokenized validation split, takes its first 200 examples,
    generates translations with the current model in batches of 4, decodes
    predictions and labels, and scores them with the loaded metric.

    Returns:
        A dict with a single key ``"bleu"`` holding the metric's ``"score"``.
    """
    subset = tokenized_datasets["validation"].shuffle().select(range(200))
    generation_batches = subset.to_tf_dataset(
        columns=["input_ids", "attention_mask", "labels"],
        collate_fn=data_collator,
        shuffle=False,
        batch_size=4,
    )

    hypotheses = []
    references = []
    for batch in generation_batches:
        generated = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        for text in tokenizer.batch_decode(generated, skip_special_tokens=True):
            hypotheses.append(text.strip())

        # Label positions filled with -100 cannot be decoded; swap in the pad
        # token id so that skip_special_tokens drops them.
        label_ids = batch["labels"].numpy()
        label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
        for text in tokenizer.batch_decode(label_ids, skip_special_tokens=True):
            # The metric expects a list of references per prediction.
            references.append([text.strip()])

    scores = metric.compute(predictions=hypotheses, references=references)
    return {"bleu": scores["score"]}
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from within the notebook.
notebook_login()

# Score the model before fine-tuning.
print(compute_metrics())
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# The number of training steps is the number of samples in the dataset, divided
# by the batch size, then multiplied by the total number of epochs. Note that
# tf_train_dataset here is a batched tf.data.Dataset, not the original
# Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16.
# NOTE(review): the Keras mixed-precision guide says the global policy should be
# set before the model's layers are created; here the model has already been
# built and compiled, so confirm this call actually has the intended effect.
tf.keras.mixed_precision.set_global_policy("mixed_float16")
from transformers.keras_callbacks import PushToHubCallback

# Callback configured with the local output directory and the tokenizer;
# presumably uploads both model and tokenizer to the Hub during training — confirm.
callback = PushToHubCallback(
    output_dir="marian-finetuned-kde4-en-to-fr", tokenizer=tokenizer
)

# Fine-tune for num_epochs epochs, evaluating on the validation dataset.
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)
# Score the model again after fine-tuning (on a new random validation sample).
print(compute_metrics())
from transformers import pipeline # Thay nó với checkpoint của bạn model_checkpoint = "huggingface-course/marian-finetuned-kde4-en-to-fr" translator = pipeline("translation", model=model_checkpoint) translator("Default to expanded threads")
[{'translation_text': 'Par défaut, développer les fils de discussion'}]
# Translate the second sample sentence with the fine-tuned pipeline.
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)
[{'translation_text': "Impossible d'importer %1 en utilisant le module externe d'importation OFX. Ce fichier n'est pas le bon format."}]