CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/zh-CN/chapter6/section2.ipynb
Views: 2555
Kernel: Unknown Kernel

根据已有的tokenizer训练新的tokenizer

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

!pip install datasets evaluate transformers[sentencepiece] !apt install git-lfs

You will need to set up git and adapt your email and name in the following cell.

!git config --global user.email "[email protected]" !git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

# Log in to the Hugging Face Hub (prompts for credentials in the notebook).
from huggingface_hub import notebook_login

notebook_login()

from datasets import load_dataset

# This can take a few minutes to load, so grab a coffee or tea while you wait!
raw_datasets = load_dataset("code_search_net", "python")

# Display the training split (its features and number of rows).
raw_datasets["train"]
Dataset({ features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url' ], num_rows: 412178 })
# Print the `whole_func_string` field of one training example.
print(raw_datasets["train"][123456]["whole_func_string"])

# Don't uncomment the following line unless your dataset is small!
# training_corpus = [raw_datasets["train"][i: i + 1000]["whole_func_string"] for i in range(0, len(raw_datasets["train"]), 1000)]

# Generator expression: each batch of 1000 texts is only read from the
# dataset when the consumer asks for it, instead of building one big list.
training_corpus = (
    raw_datasets["train"][start : start + 1000]["whole_func_string"]
    for start in range(0, len(raw_datasets["train"]), 1000)
)
# A generator can only be consumed once: the first list() call drains it,
# so the second list() call finds nothing left and prints an empty list.
gen = (i for i in range(10))
print(list(gen))
print(list(gen))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] []
def get_training_corpus():
    """Return a new generator over the training texts, 1000 at a time.

    Each call builds a fresh generator expression, so the corpus can be
    iterated again simply by calling this function again.
    """
    batch_starts = range(0, len(raw_datasets["train"]), 1000)
    return (
        raw_datasets["train"][start : start + 1000]["whole_func_string"]
        for start in batch_starts
    )


training_corpus = get_training_corpus()
def get_training_corpus():
    """Yield the training split's `whole_func_string` values in batches of 1000.

    This is a generator function: nothing is read from the dataset until the
    returned generator is iterated.
    """
    train_split = raw_datasets["train"]
    total_rows = len(train_split)
    for offset in range(0, total_rows, 1000):
        # Slicing the split gives a batch from which the text column is taken.
        yield train_split[offset : offset + 1000]["whole_func_string"]
from transformers import AutoTokenizer

# Load the pretrained GPT-2 tokenizer that the new tokenizer will be based on.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# A small Python function used as the sample text. The line breaks and the
# 4-space indentation are part of the string and show up in the tokenization.
example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

tokens = old_tokenizer.tokenize(example)
tokens
['def', 'Ġadd', '_', 'n', 'umbers', '(', 'a', ',', 'Ġb', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`', '."', '""', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']
# Train a new tokenizer from the old one on the code corpus, asking for a
# vocabulary of 52000 entries.
tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=52000,
)

# Tokenize the same sample with the newly trained tokenizer.
tokens = tokenizer.tokenize(example)
tokens
['def', 'Ġadd', '_', 'numbers', '(', 'a', ',', 'Ġb', '):', 'ĊĠĠĠ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`."""', 'ĊĠĠĠ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']
# Token count produced by the new tokenizer for the sample ...
print(len(tokens))
# ... compared with the count produced by the original tokenizer.
print(len(old_tokenizer.tokenize(example)))
27 36
# A second sample: a small class definition. The newlines, the blank line
# between methods and the trailing indented line are all part of the string.
example = """class LinearLayer():
    def __init__(self, input_size, output_size):
        self.weight = torch.randn(input_size, output_size)
        self.bias = torch.zeros(output_size)

    def __call__(self, x):
        return x @ self.weights + self.bias
    """
tokenizer.tokenize(example)
['class', 'ĠLinear', 'Layer', '():', 'ĊĠĠĠ', 'Ġdef', 'Ġ__', 'init', '__(', 'self', ',', 'Ġinput', '_', 'size', ',', 'Ġoutput', '_', 'size', '):', 'ĊĠĠĠĠĠĠĠ', 'Ġself', '.', 'weight', 'Ġ=', 'Ġtorch', '.', 'randn', '(', 'input', '_', 'size', ',', 'Ġoutput', '_', 'size', ')', 'ĊĠĠĠĠĠĠĠ', 'Ġself', '.', 'bias', 'Ġ=', 'Ġtorch', '.', 'zeros', '(', 'output', '_', 'size', ')', 'ĊĊĠĠĠ', 'Ġdef', 'Ġ__', 'call', '__(', 'self', ',', 'Ġx', '):', 'ĊĠĠĠĠĠĠĠ', 'Ġreturn', 'Ġx', 'Ġ@', 'Ġself', '.', 'weights', 'Ġ+', 'Ġself', '.', 'bias', 'ĊĠĠĠĠ']
# Save the trained tokenizer under the name "code-search-net-tokenizer".
tokenizer.save_pretrained("code-search-net-tokenizer")

# Log in to the Hugging Face Hub before uploading.
from huggingface_hub import notebook_login

notebook_login()

# Upload the tokenizer to the Hub.
tokenizer.push_to_hub("code-search-net-tokenizer")

# Replace "huggingface-course" below with your actual namespace to use your own tokenizer
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")