CoCalc -- 4 Hugging Face lib functions.ipynb

GitHub Repository: suyashi29/python-su
Path: blob/master/GenAI Transformers Basics/4 Hugging Face lib functions.ipynb
³⁰⁷⁴ views

Kernel: Python 3 (ipykernel)

The Hugging Face transformers library provides a variety of pretrained models and pipelines that can perform different natural language processing (NLP) tasks beyond sentiment analysis. Below are some examples of tasks you can perform using different pipelines provided by the transformers library, along with a set of functions demonstrating these capabilities.

pip install wordcloud --trusted-host pypi.org --trusted-host files.pythonhosted.org transformers==4.9.2 torch==1.9.0

In [2]:

#1. Text Classification (other than sentiment analysis)
#Function: Classify text into predefined categories.
from transformers import pipeline
# Explicitly specify the model name
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

def classify_text(text, model_name="distilbert-base-uncased-finetuned-sst-2-english"):
    classifier = pipeline("text-classification", model=model_name)
    results = classifier(text)
    return results

# Example usage
text = "I hated the movie"
print(classify_text(text))

Out[2]:

[{'label': 'NEGATIVE', 'score': 0.9996954202651978}]

In [3]:

from transformers import pipeline

def recognize_entities(text):
    model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
    ner_pipeline = pipeline("ner", model=model_name, grouped_entities=True)
    results = ner_pipeline(text)
    return results

# Example usage
text = "Ashi is born in Delhi."
print(recognize_entities(text))

Out[3]:

Downloading:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

C:\Users\suyashi144893\Anaconda3\lib\site-packages\transformers\pipelines\token_classification.py:154: UserWarning: `grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy="AggregationStrategy.SIMPLE"` instead.
  warnings.warn(

[{'entity_group': 'PER', 'score': 0.9957955, 'word': 'Ashi', 'start': 0, 'end': 4}, {'entity_group': 'LOC', 'score': 0.99833214, 'word': 'Delhi', 'start': 16, 'end': 21}]

In [4]:

from transformers import pipeline

def answer_question(question, context):
    model_name = "distilbert-base-cased-distilled-squad"
    qa_pipeline = pipeline("question-answering", model=model_name)
    results = qa_pipeline(question=question, context=context)
    return results

# Example usage
context = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge."
question = "Where is Hugging Face Inc. based?"
print(answer_question(question, context))

Out[4]:

Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'score': 0.975246012210846, 'start': 40, 'end': 53, 'answer': 'New York City'}

In [5]:

from transformers import pipeline

def summarize_text(text, max_length=50, min_length=25):
    model_name = "facebook/bart-large-cnn"
    summarization_pipeline = pipeline("summarization", model=model_name)
    summary = summarization_pipeline(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']

# Example usage
text = "A good human embodies kindness, empathy, and integrity. They act selflessly, helping others and showing compassion. Honesty and respect guide their interactions, fostering trust and positive relationships. A good human values diversity, promotes equality, and strives to make the world a better place through their actions and understanding."
print(summarize_text(text))

Out[5]:

Downloading:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

C:\Users\suyashi144893\Anaconda3\lib\site-packages\torch\_tensor.py:575: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)

A good human embodies kindness, empathy, and integrity. They act selflessly, helping others and showing compassion. Honesty and respect guide their interactions, fostering trust and positive relationships.

In [7]:

from transformers import pipeline

def generate_text(prompt, max_length=50):
    model_name = "gpt2"
    text_generator = pipeline("text-generation", model=model_name)
    generated_text = text_generator(prompt, max_length=max_length, num_return_sequences=1)
    return generated_text[0]['generated_text']

# Example usage
prompt = input("Enter initial text or topic:")
print(generate_text(prompt))

Out[7]:

Enter initial text or topic:Language

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.

Language by the Book Society of America.

It's so nice to be able to be so transparent about how we are going to use our power.

I've been very lucky the time is right now. I have lots of work to

In [ ]:

from transformers import pipeline

def translate_text(text, source_lang="en", target_lang="fr"):
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    translation_pipeline = pipeline("translation", model=model_name)
    translation = translation_pipeline(text)
    return translation[0]['translation_text']

# Example usage
text = "A pretty cat"
print(translate_text(text, source_lang="en", target_lang="fr"))

In [ ]:

# Install or update required libraries
pip install wordcloud --trusted-ho st pypi.org --trusted-host files.pythonhosted.org --upgrade datasets transformers huggingface_hub

In [ ]:

# Import the necessary modules
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset, load_metric
import torch
import numpy as np

# Load dataset
dataset = load_dataset("ag_news")

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

# Tokenize the input data
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Create a data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create a Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)

In [ ]:

Product

Resources

Company