Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Path: blob/main/course/fr/chapter6/section7.ipynb
Views: 2548
Kernel: Python 3
Unigram tokenization
Nous gardons un modèle en anglais ici car il n'existe pas de modèle en français utilisant la tokenisation Unigram.
Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce notebook.
In [ ]:
!pip install datasets transformers[sentencepiece]
In [ ]:
corpus = [ "This is the Hugging Face Course.", "This chapter is about tokenization.", "This section shows several tokenizer algorithms.", "Hopefully, you will be able to understand how they are trained and generate tokens.", ]
In [ ]:
from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
In [ ]:
from collections import defaultdict word_freqs = defaultdict(int) for text in corpus: words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text) new_words = [word for word, offset in words_with_offsets] for word in new_words: word_freqs[word] += 1 word_freqs
In [ ]:
char_freqs = defaultdict(int) subwords_freqs = defaultdict(int) for word, freq in word_freqs.items(): for i in range(len(word)): char_freqs[word[i]] += freq # Boucle à travers les sous-mots de longueur au moins égale à 2 for j in range(i + 2, len(word) + 1): subwords_freqs[word[i:j]] += freq # Trier les sous-mots par fréquence sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True) sorted_subwords[:10]
In [ ]:
token_freqs = list(char_freqs.items()) + sorted_subwords[: 300 - len(char_freqs)] token_freqs = {token: freq for token, freq in token_freqs}
In [ ]:
from math import log total_sum = sum([freq for token, freq in token_freqs.items()]) model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}
In [ ]:
def encode_word(word, model): best_segmentations = [{"start": 0, "score": 1}] + [ {"start": None, "score": None} for _ in range(len(word)) ] for start_idx in range(len(word)): # Il doit être correctement rempli par les étapes précédentes de la boucle. best_score_at_start = best_segmentations[start_idx]["score"] for end_idx in range(start_idx + 1, len(word) + 1): token = word[start_idx:end_idx] if token in model and best_score_at_start is not None: score = model[token] + best_score_at_start # Si nous avons trouvé une meilleure segmentation se terminant à end_idx, nous mettons à jour if ( best_segmentations[end_idx]["score"] is None or best_segmentations[end_idx]["score"] > score ): best_segmentations[end_idx] = {"start": start_idx, "score": score} segmentation = best_segmentations[-1] if segmentation["score"] is None: # Nous n'avons pas trouvé de tokenization du mot -> inconnu return ["<unk>"], None score = segmentation["score"] start = segmentation["start"] end = len(word) tokens = [] while start != 0: tokens.insert(0, word[start:end]) next_start = best_segmentations[start]["start"] end = start start = next_start tokens.insert(0, word[start:end]) return tokens, score
In [ ]:
print(encode_word("Hopefully", model)) print(encode_word("This", model))
In [ ]:
def compute_loss(model): loss = 0 for word, freq in word_freqs.items(): _, word_loss = encode_word(word, model) loss += freq * word_loss return loss
In [ ]:
compute_loss(model)
In [ ]:
import copy def compute_scores(model): scores = {} model_loss = compute_loss(model) for token, score in model.items(): # Nous gardons toujours les tokens de longueur 1. if len(token) == 1: continue model_without_token = copy.deepcopy(model) _ = model_without_token.pop(token) scores[token] = compute_loss(model_without_token) - model_loss return scores
In [ ]:
scores = compute_scores(model) print(scores["ll"]) print(scores["his"])
In [ ]:
percent_to_remove = 0.1 while len(model) > 100: scores = compute_scores(model) sorted_scores = sorted(scores.items(), key=lambda x: x[1]) # Supprime les tokens percent_to_remove ayant les scores les plus bas. for i in range(int(len(model) * percent_to_remove)): _ = token_freqs.pop(sorted_scores[i][0]) total_sum = sum([freq for token, freq in token_freqs.items()]) model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}
In [ ]:
def tokenize(text, model): words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text) pre_tokenized_text = [word for word, offset in words_with_offsets] encoded_words = [encode_word(word, model)[0] for word in pre_tokenized_text] return sum(encoded_words, []) tokenize("This is the Hugging Face course.", model)