GitHub Repository: huggingface/notebooks
Path: blob/main/course/videos/semantic_search.ipynb

This notebook contains the code samples from the video below, which is part of the Hugging Face course.

#@title
from IPython.display import HTML

HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/OATCgQtNX2o?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')

Install the Transformers and Datasets libraries to run this notebook.

! pip install datasets transformers[sentencepiece]
import torch
from transformers import AutoTokenizer, AutoModel

sentences = [
    "I took my dog for a walk",
    "Today is going to rain",
    "I took my cat for a walk",
]

model_ckpt = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# Tokenize the sentences and run them through the model to get one
# hidden-state vector per token
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    model_output = model(**encoded_input)

token_embeddings = model_output.last_hidden_state
print(f"Token embeddings shape: {token_embeddings.size()}")
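
The model returns one vector per token, so we still need a pooling step to get a single vector per sentence. As a quick sanity check (not part of the original cell), you can print the attention mask to see which positions are real tokens and which are padding; the mean-pooling function in the next cell uses exactly this mask to average only over real tokens.

# Sanity check (a sketch, not from the video): 1 marks a real token, 0 marks padding
print(encoded_input["attention_mask"])
print(encoded_input["attention_mask"].size())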
import torch.nn.functional as F


def mean_pooling(model_output, attention_mask):
    # Average the token embeddings, using the attention mask to ignore padded positions
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )


sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
# Normalize the embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
print(f"Sentence embeddings shape: {sentence_embeddings.size()}")
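
Because the sentence embeddings have been L2-normalized, the cosine similarity between two sentences reduces to a plain dot product. The next cell computes the score matrix with scikit-learn, but as a sketch (not in the original notebook) you could get the same matrix directly in PyTorch:

# With normalized embeddings, pairwise cosine similarities are just dot products
similarity_matrix = sentence_embeddings @ sentence_embeddings.T
print(similarity_matrix)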
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

sentence_embeddings = sentence_embeddings.detach().numpy()

# Pairwise cosine similarities between all sentence embeddings
scores = np.zeros((sentence_embeddings.shape[0], sentence_embeddings.shape[0]))
for idx in range(sentence_embeddings.shape[0]):
    scores[idx, :] = cosine_similarity([sentence_embeddings[idx]], sentence_embeddings)[0]
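
To see what these scores mean in practice, here is a small follow-up sketch (not part of the course material) that ranks the other sentences by their similarity to the first one; the cat sentence should score well above the weather sentence.

# Rank the other sentences by similarity to sentences[0] (a sketch)
ranking = scores[0].argsort()[::-1]
for idx in ranking:
    if idx != 0:
        print(f"{scores[0, idx]:.4f}  {sentences[idx]}")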
from datasets import load_dataset

squad = load_dataset("squad", split="validation").shuffle(seed=42).select(range(100))


def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v for k, v in encoded_input.items()}
    with torch.no_grad():
        model_output = model(**encoded_input)
    return mean_pooling(model_output, encoded_input["attention_mask"])


# Add a column with one embedding vector per context passage
squad_with_embeddings = squad.map(
    lambda x: {"embeddings": get_embeddings(x["context"]).cpu().numpy()[0]}
)
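
Each of the 100 SQuAD examples now carries an "embeddings" column holding one vector per context passage (384-dimensional, assuming the default all-MiniLM-L6-v2 configuration). A quick inspection sketch:

# Inspect the augmented dataset (a sketch): one embedding vector per example
print(squad_with_embeddings)
print(len(squad_with_embeddings[0]["embeddings"]))  # expected: 384 for MiniLM-L6-v2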
# Build a FAISS index over the embeddings and retrieve the passages
# closest to the question embedding
squad_with_embeddings.add_faiss_index(column="embeddings")

question = "Who headlined the halftime show for Super Bowl 50?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()

scores, samples = squad_with_embeddings.get_nearest_examples(
    "embeddings", question_embedding, k=3
)
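
get_nearest_examples returns the FAISS scores together with the matching examples as a dictionary of columns, so the retrieved passages can be printed directly. A sketch for displaying the top results ("title" and "context" are standard SQuAD columns):

# Show the retrieved passages with their FAISS scores (a sketch)
for score, title, context in zip(scores, samples["title"], samples["context"]):
    print(f"score: {score:.3f}  title: {title}")
    print(context[:200], "...")
    print()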