{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "uZbDVal8mAeP",
    "outputId": "42178cc8-1fd5-4b38-a041-5bd2db4691bd"
   },
   "outputs": [],
   "source": [
    "# Transformer Model Training and Evaluation\n",
    "# This notebook demonstrates how to train and evaluate a transformer model using the IMDb dataset.\n",
    "\n",
    "import os\n",
    "os.environ[\"WANDB_DISABLED\"] = \"true\"  # Disable Weights and Biases logging\n",
    "%pip install transformers datasets evaluate torch  # Install necessary packages\n",
    "\n",
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, Trainer, TrainingArguments\n",
    "from datasets import load_dataset\n",
    "import evaluate\n",
    "import time"
   ]
  },
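  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The notebook metadata requests a GPU runtime (a Colab T4). As a quick sanity check, the sketch below reports whether CUDA is visible to PyTorch before any heavy work starts; it relies only on the `torch` import above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: report which device PyTorch will use\n",
    "if torch.cuda.is_available():\n",
    "    print(f\"CUDA available: {torch.cuda.get_device_name(0)}\")\n",
    "else:\n",
    "    print(\"CUDA not available; training will fall back to the CPU and be much slower.\")"
   ]
  },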
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load and Preprocess Data\n",
    "\n",
    "We load the IMDb dataset and tokenize the text. We then define the model configuration, initialize the model, set the training arguments, and train."
   ]
  },
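  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before running the full pipeline, here is a minimal, self-contained sketch of what the tokenizer produces for a single review. The sample sentence and the `demo_tokenizer` name are made up for illustration; BERT's tokenizer returns `input_ids`, `token_type_ids`, and an `attention_mask`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: inspect the tokenizer output for one made-up example\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "demo_tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
    "encoded = demo_tokenizer(\"A surprisingly moving film.\", padding=\"max_length\", truncation=True)\n",
    "print(list(encoded.keys()))  # ['input_ids', 'token_type_ids', 'attention_mask']\n",
    "print(encoded[\"input_ids\"][:10])  # first ten token ids; the rest is padding up to 512\n",
    "print(demo_tokenizer.convert_ids_to_tokens(encoded[\"input_ids\"][:10]))  # human-readable tokens"
   ]
  },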
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "yuDETMBT_yLa"
   },
   "outputs": [],
   "source": [
    "dataset = load_dataset(\"imdb\")\n",
    "\n",
    "train_data = dataset[\"train\"]\n",
    "test_data = dataset[\"test\"]\n",
    "\n",
    "model_name = \"bert-base-uncased\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)  # Initialize the tokenizer\n",
    "config = AutoConfig.from_pretrained(model_name)  # Load model configuration\n",
    "config.num_labels = 2  # Set the number of labels for classification\n",
    "\n",
    "model = AutoModelForSequenceClassification.from_config(config)  # Initialize the model\n",
    "\n",
    "def preprocess_function(examples):\n",
    "    # Tokenize the text data\n",
    "    return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n",
    "\n",
    "# Tokenize the training and test datasets\n",
    "tokenized_train = train_data.map(preprocess_function, batched=True)\n",
    "tokenized_test = test_data.map(preprocess_function, batched=True)\n",
    "\n",
    "accuracy_metric = evaluate.load(\"accuracy\")  # Load the accuracy metric\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    # Compute evaluation metrics\n",
    "    logits, labels = eval_pred\n",
    "    predictions = torch.argmax(torch.tensor(logits), dim=1)\n",
    "    return accuracy_metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "# Set training arguments\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./results\",\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    learning_rate=2e-5,\n",
    "    per_device_train_batch_size=8,\n",
    "    per_device_eval_batch_size=8,\n",
    "    num_train_epochs=3,\n",
    "    weight_decay=0.01,\n",
    "    logging_dir='./logs',\n",
    "    save_strategy=\"epoch\",\n",
    "    save_total_limit=1,\n",
    ")\n",
    "\n",
    "# Initialize the Trainer\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_train,\n",
    "    eval_dataset=tokenized_test,\n",
    "    tokenizer=tokenizer,\n",
    "    compute_metrics=compute_metrics\n",
    ")\n",
    "\n",
    "print(\"Training model...\")\n",
    "start_time = time.time()\n",
    "trainer.train()  # Train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "CNB0wZDe_69s"
   },
   "outputs": [],
   "source": [
    "# Measure GPU memory usage after training\n",
    "if torch.cuda.is_available():\n",
    "    print(f\"GPU Memory Usage After Training: {torch.cuda.memory_allocated() / 1024**3:.2f} GB\")\n",
    "\n",
    "end_time = time.time()\n",
    "training_time = end_time - start_time\n",
    "print(f\"Training time: {training_time:.2f} seconds\")\n",
    "\n",
    "# Evaluate the model\n",
    "print(\"Evaluating model...\")\n",
    "start_time = time.time()\n",
    "results = trainer.evaluate()  # Evaluate the model on the test set\n",
    "\n",
    "# Measure GPU memory usage after evaluation\n",
    "if torch.cuda.is_available():\n",
    "    print(f\"GPU Memory Usage After Evaluation: {torch.cuda.memory_allocated() / 1024**3:.2f} GB\")\n",
    "\n",
    "end_time = time.time()\n",
    "inference_time = end_time - start_time\n",
    "print(f\"Inference time (evaluation on the test set): {inference_time:.2f} seconds\")\n",
    "print(results)\n",
    "\n",
    "# Extract evaluation metrics\n",
    "accuracy = results.get(\"eval_accuracy\", None)\n",
    "precision = results.get(\"eval_precision\", None)\n",
    "recall = results.get(\"eval_recall\", None)\n",
    "f1_score = results.get(\"eval_f1\", None)\n",
    "roc_auc = results.get(\"eval_roc_auc\", None)\n",
    "\n",
    "# GPU memory usage during evaluation and training\n",
    "gpu_memory_after_training = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else None\n",
    "gpu_memory_after_evaluation = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else None\n",
    "\n",
    "# Display the metrics\n",
    "print(f\"Accuracy: {accuracy:.4f}\" if accuracy is not None else \"Accuracy metric not found.\")\n",
    "print(f\"Precision: {precision:.4f}\" if precision is not None else \"Precision metric not found.\")\n",
    "print(f\"Recall: {recall:.4f}\" if recall is not None else \"Recall metric not found.\")\n",
    "print(f\"F1-Score: {f1_score:.4f}\" if f1_score is not None else \"F1-Score metric not found.\")\n",
    "print(f\"ROC-AUC: {roc_auc:.4f}\" if roc_auc is not None else \"ROC-AUC metric not found.\")\n",
    "\n",
    "# Displaying GPU memory usage\n",
    "if gpu_memory_after_training:\n",
    "    print(f\"GPU Memory Usage After Training: {gpu_memory_after_training:.2f} GB\")\n",
    "if gpu_memory_after_evaluation:\n",
    "    print(f\"GPU Memory Usage After Evaluation: {gpu_memory_after_evaluation:.2f} GB\")"
   ]
  },
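  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a final sanity check, the sketch below classifies a single hand-written review with the trained model. It reuses `model`, `tokenizer`, and `torch` from the cells above; the `predict_sentiment` helper and the example review are illustrative additions, and since the model was trained from scratch (not fine-tuned from pretrained weights), its predictions may be rough."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative helper: classify one review with the trained model\n",
    "def predict_sentiment(text):\n",
    "    inputs = tokenizer(text, truncation=True, return_tensors=\"pt\").to(model.device)\n",
    "    with torch.no_grad():\n",
    "        logits = model(**inputs).logits\n",
    "    label = torch.argmax(logits, dim=1).item()\n",
    "    return \"positive\" if label == 1 else \"negative\"\n",
    "\n",
    "print(predict_sentiment(\"An absolute delight from start to finish.\"))"
   ]
  }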
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}