Path: blob/main/course/fr/chapter6/section5.ipynb
Kernel: Python 3
Byte-Pair Encoding tokenization
Install the 🤗 Transformers and 🤗 Datasets libraries to run this notebook.
In [ ]:
!pip install datasets transformers[sentencepiece]
In [ ]:
corpus = [ "C'est le cours d'Hugging Face.", "Ce chapitre traite de la tokenisation.", "Cette section présente plusieurs algorithmes de tokenizer.", "Avec un peu de chance, vous serez en mesure de comprendre comment ils sont entraînés et génèrent des tokens.", ]
In [ ]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("asi/gpt-fr-cased-small")
In [ ]:
from collections import defaultdict

word_freqs = defaultdict(int)

# Pre-tokenize each text into words and count how often each word appears
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print(word_freqs)
In [ ]:
alphabet = []

for word in word_freqs.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)
alphabet.sort()

print(alphabet)
In [ ]:
vocab = ["<|endoftext|>"] + alphabet.copy()
In [ ]:
splits = {word: [c for c in word] for word in word_freqs.keys()}
In [ ]:
def compute_pair_freqs(splits):
    # Count how often each adjacent pair of symbols occurs, weighted by word frequency
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs
In [ ]:
pair_freqs = compute_pair_freqs(splits)

for i, key in enumerate(pair_freqs.keys()):
    print(f"{key}: {pair_freqs[key]}")
    if i >= 5:
        break
In [ ]:
best_pair = "" max_freq = None for pair, freq in pair_freqs.items(): if max_freq is None or max_freq < freq: best_pair = pair max_freq = freq print(best_pair, max_freq)
In [ ]:
merges = {("Ġ", "t"): "Ġt"} vocab.append("Ġt")
In [ ]:
def merge_pair(a, b, splits):
    # Replace every occurrence of the adjacent pair (a, b) with the merged symbol a + b
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits
In [ ]:
splits = merge_pair("Ġ", "t", splits) splits
In [ ]:
vocab_size = 50

# Keep merging the most frequent pair until the vocabulary reaches the target size
while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])
In [ ]:
print(merges)
In [ ]:
print(vocab)
In [ ]:
def tokenize(text):
    # Pre-tokenize, split each word into characters, then apply the learned merges in order
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    splits = [[l for l in word] for word in pre_tokenized_text]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    return sum(splits, [])
In [ ]:
tokenize("Ce n'est pas un token.")