Spaces:

lucabadiali
/

ML_OPS_Project

Running

App Files Files Community

lucabadiali committed on Nov 10

Commit

b603fd0

1 Parent(s): f97ec54

Added app testing

Browse files

Files changed (17) hide show

README.md +57 -1
data/load_data.py +11 -0
nb.ipynb +382 -148
pytest.ini +2 -0
src/__pycache__/app.cpython-311.pyc +0 -0
src/__pycache__/utils.cpython-311.pyc +0 -0
src/app/__init__.py +0 -0
src/app/__pycache__/__init__.cpython-311.pyc +0 -0
src/app/__pycache__/app.cpython-311.pyc +0 -0
src/app/__pycache__/utils.cpython-311.pyc +0 -0
src/app/app.py +77 -0
src/app/app_post.py +22 -0
src/app/utils.py +7 -0
tests/__pycache__/test_app.cpython-311-pytest-9.0.0.pyc +0 -0
tests/__pycache__/test_data.cpython-311-pytest-9.0.0.pyc +0 -0
tests/test_app.py +89 -0
tests/test_data.py +4 -0

README.md CHANGED Viewed

	@@ -1 +1,57 @@
1	- # MLOPS_Project

+# MLOPS_Project
+FASE 1)
+    - Riuscire ad allenare un modello
+FASE 2)
+Public Colab notebook (single link)
+Loads a ready model: cardiffnlp/twitter-roberta-base-sentiment-latest (or -sep2022).
+Loads a public dataset (e.g., tweet_eval/sentiment).
+Runs inference + evaluation (accuracy, F1 macro, recall macro).
+(Optional but easy) light fine-tuning on a fraction of the data (small batch, few epochs).
+Shows a tiny monitoring demo: aggregate % positive/neutral/negative over a sample and plot a time series (synthetic timestamps are fine).
+Links to your GitHub repo at the top.
+Public GitHub repo
+src/ with:
+train.py — fine-tuning script (works on CPU/MPS/CUDA; small batch + gradient accumulation).
+eval.py — evaluate a model checkpoint on validation/test.
+infer.py — batch inference from CSV/JSONL.
+app.py — (optional) Gradio mini UI.
+data_utils.py — your subset functions + tokenization helpers.
+requirements.txt
+README.md — how to run locally + what the project does.
+.github/workflows/ci.yml — CI runs lint + tests + a tiny dry-run of training (e.g., 500 samples, 1 epoch).
+MODEL_CARD.md — brief model card (data, metrics, limits/bias).
+tests/test_smoke.py — imports + 10-sample training/eval smoke test.
+Minimal documentation (in README)
+Goal: monitor social sentiment for MachineInnovators Inc.
+Model choice: use pre-trained RoBERTa; FastText kept as optional baseline.
+Pipeline overview: data → tokenize → (optional fine-tune) → evaluate → artifact save → (optional deploy).
+How to reproduce: exact commands.
+Monitoring idea: log predictions; compute daily sentiment mix; simple drift check (distribution shift of logits).

data/load_data.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from datasets import load_dataset, DatasetDict
+from pathlib import Path
+DATA_FOLDER_PATH = Path(__file__).resolve().parent  # absolute path of the directory containing this script
+dataset_path = DATA_FOLDER_PATH / "dataset"  # save target: a "dataset" entry next to this script
+# def get_tweet_eval_sentiment() -> DatasetDict:
+#     return load_dataset("tweet_eval", "sentiment")
+dataset = load_dataset("tweet_eval", "sentiment")  # NOTE(review): executes at module import, not only when run as a script; presumably needs network or a local cache - confirm
+dataset.save_to_disk(dataset_path)  # persists the loaded object to dataset_path on every import/run

nb.ipynb CHANGED Viewed

@@ -2,17 +2,25 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 3,
    "id": "3a03d7b9",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "device(type='cpu')"
       ]
      },
-     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -28,8 +36,10 @@
     "import torch.utils.data as data_utils\n",
     "import torch\n",
     "\n",
-    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
-    "device\n"
    ]
   },
   {
@@ -50,7 +60,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "id": "0b451180",
    "metadata": {},
    "outputs": [
@@ -60,7 +70,7 @@
        "['negative', 'neutral', 'positive']"
       ]
      },
-     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -79,103 +89,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
-   "id": "ede5d09e",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
      ]
     }
    ],
    "source": [
-    "MODEL = \"FacebookAI/roberta-base\"\n",
-    "model = RobertaForSequenceClassification.from_pretrained(\n",
-    "    MODEL, num_labels=3, problem_type=\"multi_label_classification\")\n",
-    "tokenizer = AutoTokenizer.from_pretrained(MODEL)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 92,
-   "id": "c4bafe30",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_text_file = \"train_text.txt\"\n",
-    "with open(train_text_file, \"r\") as f:\n",
-    "    texts = f.readlines()\n",
-    "\n",
-    "train_label_file = \"train_labels.txt\"\n",
-    "with open(train_label_file, \"r\") as f:\n",
-    "    labels = f.readlines()\n",
     "\n",
-    "len(texts), len(labels)\n",
-    "\n",
-    "texts, labels = texts[:100], labels[:100]\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "87030ba1",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(2, 100)"
-      ]
-     },
-     "execution_count": 93,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "encoded_inputs = tokenizer([ preprocess(t.strip()) for t in texts], return_tensors='pt',   padding=True,\n",
-    "    truncation=True)\n",
-    "labels = [int(labels[i].strip()) for i in range(len(labels))]\n",
-    "labels = torch.tensor(labels, dtype=torch.int)\n",
-    "len(encoded_inputs), len(labels)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 94,
-   "id": "e9548356",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dataset = data_utils.TensorDataset(encoded_inputs[\"input_ids\"], encoded_inputs[\"attention_mask\"], labels)\n",
-    "test_dataloader = data_utils.DataLoader(dataset, batch_size=10, shuffle=True)\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2f40f7fd",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "08435697",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "model = AutoModelForSequenceClassification.from_pretrained(MODEL)\n",
     "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
-    "text = \"Good night 😊\"\n",
     "text = preprocess(text)\n",
     "encoded_input = tokenizer(text, return_tensors='pt')\n",
     "output = model(**encoded_input)\n",
@@ -185,7 +117,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "id": "cf6dfc8f",
    "metadata": {},
    "outputs": [
@@ -193,9 +125,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "1) positive 0.8466\n",
-      "2) neutral 0.1458\n",
-      "3) negative 0.0076\n"
      ]
     }
    ],
@@ -218,7 +150,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "id": "0a6382f4",
    "metadata": {},
    "outputs": [],
@@ -232,7 +164,7 @@
     "from datasets import load_dataset, concatenate_datasets\n",
     "\n",
     "def tokenize_function(examples):\n",
-    "    return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n",
     "\n",
     "def compute_metrics(eval_pred):\n",
     "    logits, labels = eval_pred\n",
@@ -247,110 +179,412 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "dafaf26d",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Map: 100%|██████████| 45615/45615 [00:10<00:00, 4346.61 examples/s]\n",
-      "Map: 100%|██████████| 12284/12284 [00:03<00:00, 3758.76 examples/s]\n",
-      "Map: 100%|██████████| 2000/2000 [00:00<00:00, 4820.04 examples/s]\n",
       "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sep2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
-      "/workspaces/MLOPS_Project/Env/lib/python3.12/site-packages/torch/utils/data/dataloader.py:668: UserWarning: 'pin_memory' argument is set as true but no accelerator is found, then device pinned memory won't be used.\n",
-      "  warnings.warn(warn_msg)\n"
      ]
     },
     {
-     "ename": "",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
-      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
-      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
-      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
-     ]
     }
    ],
    "source": [
-    "MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sep2022'  # change to desired model from the hub\n",
-    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
     "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
     "\n",
-    "# augment train set with test set, for downstream apps only - DO NOT EVALUATE ON TEST\n",
-    "# tokenized_datasets['train+test'] = concatenate_datasets([tokenized_datasets['train'],\n",
-    "#                                                          tokenized_datasets['test']])\n",
     "\n",
-    "model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)\n",
     "\n",
     "\n",
     "training_args = TrainingArguments(\n",
-    "    output_dir=\"test_trainer\",\n",
     "    learning_rate=1e-5,\n",
-    "    per_device_train_batch_size=16,   # modern name\n",
-    "    per_device_eval_batch_size=16,    # modern name\n",
-    "    num_train_epochs=10,\n",
     "    weight_decay=0.01,\n",
     "    warmup_ratio=0.1,\n",
     "\n",
-    "    eval_strategy=\"epoch\",\n",
-    "    logging_strategy=\"epoch\",\n",
-    "    save_strategy=\"epoch\",\n",
     "\n",
     "    load_best_model_at_end=True,\n",
     "    metric_for_best_model=\"recall\",\n",
     "    greater_is_better=True,\n",
     "    report_to=\"none\",\n",
     ")\n",
     "\n",
-    "metric = evaluate.load('recall')  # default metric for sentiment dataset is recall (macro)\n",
     "\n",
     "trainer = Trainer(\n",
     "    model=model,\n",
     "    args=training_args,\n",
-    "    train_dataset=tokenized_datasets['train'],\n",
-    "    eval_dataset=tokenized_datasets['validation'],\n",
     "    compute_metrics=compute_metrics,\n",
     ")\n",
     "\n",
     "trainer.train()\n",
     "\n",
-    "trainer.create_model_card()\n",
-    "trainer.save_model('saved_model')\n",
-    "\n",
-    "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "id": "183032a5",
    "metadata": {},
    "outputs": [
     {
-     "ename": "NameError",
-     "evalue": "name 'torch' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
-      "\u001b[31mNameError\u001b[39m                                 Traceback (most recent call last)",
-      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mtorch\u001b[49m.cuda.is_available()\n",
-      "\u001b[31mNameError\u001b[39m: name 'torch' is not defined"
      ]
     }
    ],
    "source": [
-    "torch.cuda.is_available()\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1246f9c6",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -358,7 +592,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Env",
    "language": "python",
    "name": "python3"
   },
@@ -372,7 +606,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.1"
   }
  },
  "nbformat": 4,

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 1,
    "id": "3a03d7b9",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/lucabadiali/Desktop/professionAI/modulo9/Project/MLOPS_Project/ProjectEnv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
+       "device(type='mps')"
       ]
      },
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
     "import torch.utils.data as data_utils\n",
     "import torch\n",
     "\n",
+    "\n",
+    "device = torch.device(\"mps\" if torch.mps.is_available() else \n",
+    "                      \"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "device"
    ]
   },
   {
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "id": "0b451180",
    "metadata": {},
    "outputs": [
        "['negative', 'neutral', 'positive']"
       ]
      },
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
+   "id": "08435697",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
+      "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
      ]
     }
    ],
    "source": [
     "\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(\"cardiffnlp/twitter-roberta-base-sentiment-latest\")\n",
     "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
+    "text = \"today I ate some pasta\"\n",
     "text = preprocess(text)\n",
     "encoded_input = tokenizer(text, return_tensors='pt')\n",
     "output = model(**encoded_input)\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 17,
    "id": "cf6dfc8f",
    "metadata": {},
    "outputs": [
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "1) neutral 0.6674\n",
+      "2) positive 0.3132\n",
+      "3) negative 0.0194\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 21,
    "id": "0a6382f4",
    "metadata": {},
    "outputs": [],
     "from datasets import load_dataset, concatenate_datasets\n",
     "\n",
     "def tokenize_function(examples):\n",
+    "    return tokenizer(examples[\"text\"], max_length=128, truncation=True)\n",
     "\n",
     "def compute_metrics(eval_pred):\n",
     "    logits, labels = eval_pred\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 22,
+   "id": "fcb7fe6d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['text', 'label'],\n",
+       "        num_rows: 45615\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['text', 'label'],\n",
+       "        num_rows: 12284\n",
+       "    })\n",
+       "    validation: Dataset({\n",
+       "        features: ['text', 'label'],\n",
+       "        num_rows: 2000\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "0fabaaea",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "`torch_dtype` is deprecated! Use `dtype` instead!\n",
       "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sep2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "/var/folders/nc/1wpyndzx5ps8nbt0b5zm9jx80000gn/T/ipykernel_2067/3094520460.py:119: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
+      "  trainer = Trainer(\n"
      ]
     },
     {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='858' max='858' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [858/858 21:50, Epoch 3/3]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "      <th>Accuracy</th>\n",
+       "      <th>F1 Macro</th>\n",
+       "      <th>Recall</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>500</td>\n",
+       "      <td>0.594100</td>\n",
+       "      <td>0.668015</td>\n",
+       "      <td>0.716000</td>\n",
+       "      <td>0.697675</td>\n",
+       "      <td>0.702171</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     }
    ],
    "source": [
+    "import os\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"  # avoid fork/parallelism warnings on macOS\n",
+    "\n",
+    "import torch\n",
+    "from transformers import (\n",
+    "    AutoTokenizer, AutoModelForSequenceClassification,\n",
+    "    TrainingArguments, Trainer, EarlyStoppingCallback,\n",
+    "    DataCollatorWithPadding\n",
+    ")\n",
+    "import evaluate\n",
+    "\n",
+    "# --- Device detection ---\n",
+    "if torch.cuda.is_available():\n",
+    "    device = \"cuda\"\n",
+    "    use_bf16 = torch.cuda.is_bf16_supported()\n",
+    "    use_fp16 = not use_bf16\n",
+    "elif torch.backends.mps.is_available():\n",
+    "    device = \"mps\"\n",
+    "    use_bf16 = False\n",
+    "    use_fp16 = False\n",
+    "else:\n",
+    "    device = \"cpu\"\n",
+    "    use_bf16 = False\n",
+    "    use_fp16 = False\n",
+    "\n",
+    "MODEL_NAME = \"cardiffnlp/twitter-roberta-base-sep2022\"\n",
+    "\n",
+    "# --- Tokenizer: keep short max_length to save memory ---\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, model_max_length=128)\n",
+    "\n",
+    "def tokenize_function(batch):\n",
+    "    return tokenizer(\n",
+    "        batch[\"text\"],\n",
+    "        truncation=True,\n",
+    "        max_length=128,\n",
+    "        padding=False  # we will pad per-batch via DataCollatorWithPadding\n",
+    "    )\n",
+    "\n",
+    "# If your dataset column is \"label\", keep it; Trainer can handle it.\n",
     "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
     "\n",
+    "# --- Data collator that pads dynamically ---\n",
+    "data_collator = DataCollatorWithPadding(\n",
+    "    tokenizer=tokenizer,\n",
+    "    pad_to_multiple_of=8 if (device == \"cuda\" and (use_bf16 or use_fp16)) else None\n",
+    ")\n",
+    "\n",
+    "# --- Model dtype choice ---\n",
+    "if device == \"cuda\" and use_bf16:\n",
+    "    load_dtype = torch.bfloat16\n",
+    "elif device == \"cuda\" and use_fp16:\n",
+    "    load_dtype = torch.float16\n",
+    "else:\n",
+    "    load_dtype = torch.float32  # MPS/CPU -> fp32\n",
     "\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(\n",
+    "    MODEL_NAME, num_labels=3, torch_dtype=load_dtype\n",
+    ")\n",
+    "model.gradient_checkpointing_enable()\n",
+    "model.config.use_cache = False\n",
     "\n",
+    "# --- Training args: stop forking on macOS, fix pin_memory ---\n",
+    "trainer_fp16 = bool(device == \"cuda\" and use_fp16)\n",
+    "trainer_bf16 = bool(device == \"cuda\" and use_bf16)\n",
     "\n",
     "training_args = TrainingArguments(\n",
+    "    output_dir=\"artifacts\",\n",
     "    learning_rate=1e-5,\n",
+    "    per_device_train_batch_size=4,\n",
+    "    per_device_eval_batch_size=8,\n",
+    "    gradient_accumulation_steps=8,\n",
+    "    num_train_epochs=3,\n",
     "    weight_decay=0.01,\n",
     "    warmup_ratio=0.1,\n",
+    "    lr_scheduler_type=\"linear\",\n",
     "\n",
+    "    eval_strategy=\"steps\",\n",
+    "    logging_strategy=\"steps\",\n",
+    "    save_strategy=\"steps\",\n",
+    "    eval_steps=500,\n",
+    "    logging_steps=100,\n",
+    "    save_steps=500,\n",
     "\n",
     "    load_best_model_at_end=True,\n",
     "    metric_for_best_model=\"recall\",\n",
     "    greater_is_better=True,\n",
+    "    save_total_limit=2,\n",
+    "\n",
+    "    # Precision\n",
+    "    fp16=trainer_fp16,\n",
+    "    bf16=trainer_bf16,\n",
+    "\n",
+    "    # DataLoader knobs (avoid fork/tokenizers warning on macOS)\n",
+    "    dataloader_num_workers=0,                         # <- key for macOS/MPS\n",
+    "    dataloader_pin_memory=(device == \"cuda\"),         # False on MPS/CPU, True on CUDA\n",
+    "    group_by_length=True,\n",
     "    report_to=\"none\",\n",
     ")\n",
     "\n",
+    "# --- Metrics (macro recall, etc.) ---\n",
+    "recall_metric = evaluate.load(\"recall\")\n",
+    "acc_metric = evaluate.load(\"accuracy\")\n",
+    "f1_metric = evaluate.load(\"f1\")\n",
+    "\n",
+    "def compute_metrics(eval_pred):\n",
+    "    logits, labels = eval_pred\n",
+    "    preds = logits.argmax(axis=-1)\n",
+    "    return {\n",
+    "        \"accuracy\": acc_metric.compute(predictions=preds, references=labels)[\"accuracy\"],\n",
+    "        \"f1_macro\": f1_metric.compute(predictions=preds, references=labels, average=\"macro\")[\"f1\"],\n",
+    "        \"recall\": recall_metric.compute(predictions=preds, references=labels, average=\"macro\")[\"recall\"],\n",
+    "    }\n",
+    "\n",
+    "callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]\n",
+    "\n",
+    "small_train = tokenized_datasets[\"train\"].select(range(100))\n",
+    "small_eval  = tokenized_datasets[\"validation\"].select(range(1))\n",
     "\n",
     "trainer = Trainer(\n",
     "    model=model,\n",
     "    args=training_args,\n",
+    "    train_dataset= train_ds,\n",
+    "    eval_dataset= eval_ds,\n",
     "    compute_metrics=compute_metrics,\n",
+    "    data_collator=data_collator,       # <- important\n",
+    "    tokenizer=tokenizer,\n",
+    "    callbacks=callbacks,\n",
     ")\n",
     "\n",
+    "# Optional explicit device move (Trainer usually handles it)\n",
+    "model.to(device)\n",
     "trainer.train()\n",
     "\n",
+    "trainer.save_model(\"saved_model\")\n",
+    "tokenizer.save_pretrained(\"saved_model\")\n",
+    "try:\n",
+    "    trainer.create_model_card()\n",
+    "except Exception:\n",
+    "    pass\n"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 20,
+   "id": "30d9a79b",
    "metadata": {},
    "outputs": [
     {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1) neutral 0.7894\n",
+      "2) positive 0.1149\n",
+      "3) negative 0.0957\n"
      ]
     }
    ],
    "source": [
+    "text = \"The second law of thermodynamics is about entropy\"\n",
+    "text = preprocess(text)\n",
+    "encoded_input = tokenizer(text, return_tensors='pt').to(device)\n",
+    "output = model(**encoded_input)\n",
+    "scores = output[0][0].detach().cpu().numpy()\n",
+    "scores = softmax(scores)\n",
+    "ranking = np.argsort(scores)\n",
+    "ranking = ranking[::-1]\n",
+    "for i in range(scores.shape[0]):\n",
+    "    l = labels[ranking[i]]\n",
+    "    s = scores[ranking[i]]\n",
+    "    print(f\"{i+1}) {l} {np.round(float(s), 4)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "c4376c93",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Map: 100%|██████████| 1000/1000 [00:00<00:00, 10983.68 examples/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ---- COPY-PASTE FROM HERE ----\n",
+    "import os\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+    "\n",
+    "from datasets import DatasetDict\n",
+    "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
+    "\n",
+    "def make_trainer_ready(\n",
+    "    raw_ds: DatasetDict,\n",
+    "    model_name: str = \"cardiffnlp/twitter-roberta-base-sep2022\",\n",
+    "    train_frac: float = 0.2,\n",
+    "    val_frac: float = 0.2,\n",
+    "    seed: int = 42,\n",
+    "    label_col: str = \"label\",\n",
+    "    text_col: str = \"text\",\n",
+    "    max_length: int = 128,\n",
+    "    pad_to_multiple_of_8_on_cuda: bool = True,\n",
+    "):\n",
+    "    \"\"\"\n",
+    "    Returns (train_ds, eval_ds, data_collator, tokenizer) ready for HF Trainer.\n",
+    "    - Ensures there's a validation split (creates one from train if missing).\n",
+    "    - Takes fractional subsets, stratified by label when possible.\n",
+    "    - Tokenizes and keeps only the columns Trainer expects.\n",
+    "    \"\"\"\n",
+    "    assert 0 < train_frac <= 1.0, \"train_frac must be in (0,1].\"\n",
+    "    assert 0 < val_frac <= 1.0, \"val_frac must be in (0,1].\"\n",
+    "    assert text_col in raw_ds[\"train\"].column_names, f\"Missing text column: {text_col}\"\n",
+    "    assert label_col in raw_ds[\"train\"].column_names, f\"Missing label column: {label_col}\"\n",
+    "\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, model_max_length=max_length)\n",
+    "\n",
+    "    # 1) Ensure we have a validation split\n",
+    "    if \"validation\" not in raw_ds:\n",
+    "        split = raw_ds[\"train\"].train_test_split(\n",
+    "            test_size=val_frac,\n",
+    "            stratify_by_column=label_col if label_col in raw_ds[\"train\"].column_names else None,\n",
+    "            seed=seed,\n",
+    "        )\n",
+    "        raw_ds = DatasetDict(train=split[\"train\"], validation=split[\"test\"])\n",
+    "    else:\n",
+    "        raw_ds = DatasetDict(train=raw_ds[\"train\"], validation=raw_ds[\"validation\"])\n",
+    "\n",
+    "    # 2) Take fractions (stratified when possible)\n",
+    "    def take_frac(ds, frac):\n",
+    "        if frac >= 1.0:  # keep full split\n",
+    "            return ds\n",
+    "        out = ds.train_test_split(\n",
+    "            test_size=1 - frac,\n",
+    "            stratify_by_column=label_col if label_col in ds.column_names else None,\n",
+    "            seed=seed,\n",
+    "        )\n",
+    "        return out[\"train\"]  # the kept fraction\n",
+    "\n",
+    "    small_train = take_frac(raw_ds[\"train\"], train_frac)\n",
+    "    small_eval  = take_frac(raw_ds[\"validation\"], val_frac)\n",
+    "\n",
+    "    # 3) Tokenize (no padding here; we pad per-batch with the collator)\n",
+    "    def tok(batch):\n",
+    "        return tokenizer(batch[text_col], truncation=True, max_length=max_length, padding=False)\n",
+    "\n",
+    "    small_train_tok = small_train.map(tok, batched=True, remove_columns=[c for c in small_train.column_names if c not in (text_col, label_col)])\n",
+    "    small_eval_tok  = small_eval.map(tok,  batched=True, remove_columns=[c for c in small_eval.column_names  if c not in (text_col, label_col)])\n",
+    "\n",
+    "    # 4) Keep only the columns Trainer needs\n",
+    "    keep_cols = [\"input_ids\", \"attention_mask\", label_col]\n",
+    "    small_train_tok = small_train_tok.remove_columns([c for c in small_train_tok.column_names if c not in keep_cols])\n",
+    "    small_eval_tok  = small_eval_tok.remove_columns([c for c in small_eval_tok.column_names  if c not in keep_cols])\n",
+    "\n",
+    "    # 5) Data collator with dynamic padding (CUDA gets pad_to_multiple_of=8)\n",
+    "    import torch\n",
+    "    pad_to_mult = 8 if (pad_to_multiple_of_8_on_cuda and torch.cuda.is_available()) else None\n",
+    "    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=pad_to_mult)\n",
+    "\n",
+    "    return small_train_tok, small_eval_tok, data_collator, tokenizer\n",
+    "\n",
+    "# ---- USAGE EXAMPLE ----\n",
+    "# Assumes you already have `dataset` (a DatasetDict with 'train' (and maybe 'validation')).\n",
+    "# Example:\n",
+    "# from datasets import load_dataset\n",
+    "# dataset = load_dataset(\"tweet_eval\", \"sentiment\")\n",
+    "\n",
+    "train_ds, eval_ds, data_collator, tokenizer = make_trainer_ready(\n",
+    "    raw_ds=dataset,\n",
+    "    model_name=\"cardiffnlp/twitter-roberta-base-sep2022\",\n",
+    "    train_frac=0.2,    # take 20% of train\n",
+    "    val_frac=0.5,      # take 50% of validation\n",
+    "    seed=42,\n",
+    "    label_col=\"label\",\n",
+    "    text_col=\"text\",\n",
+    "    max_length=128,\n",
+    ")\n",
+    "\n",
+    "# Pass directly to Trainer:\n",
+    "# from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n",
+    "# model = AutoModelForSequenceClassification.from_pretrained(\"cardiffnlp/twitter-roberta-base-sep2022\", num_labels=3)\n",
+    "# args = TrainingArguments(output_dir=\"out\", per_device_train_batch_size=4, per_device_eval_batch_size=8, evaluation_strategy=\"epoch\", report_to=\"none\")\n",
+    "# trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=eval_ds, data_collator=data_collator, tokenizer=tokenizer)\n",
+    "# trainer.train()\n",
+    "# ---- COPY-PASTE UNTIL HERE ----\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "12f775be",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['label', 'input_ids', 'attention_mask'],\n",
+       "    num_rows: 1000\n",
+       "})"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_ds"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "f87c7153",
    "metadata": {},
    "outputs": [],
    "source": []
  ],
  "metadata": {
   "kernelspec": {
+   "display_name": "ProjectEnv",
    "language": "python",
    "name": "python3"
   },
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.11.10"
   }
  },
  "nbformat": 4,

pytest.ini ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [pytest]
2	+ pythonpath = src

src/__pycache__/app.cpython-311.pyc ADDED Viewed

Binary file (5 kB). View file

src/__pycache__/utils.cpython-311.pyc ADDED Viewed

Binary file (785 Bytes). View file

src/app/__init__.py ADDED Viewed

File without changes

src/app/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (198 Bytes). View file

src/app/__pycache__/app.cpython-311.pyc ADDED Viewed

Binary file (4.53 kB). View file

src/app/__pycache__/utils.cpython-311.pyc ADDED Viewed

Binary file (789 Bytes). View file

src/app/app.py ADDED Viewed

	@@ -0,0 +1,77 @@

+from fastapi import FastAPI, HTTPException
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from .utils import preprocess
+from scipy.special import softmax
+import numpy as np
+from pydantic import BaseModel
+import urllib.request
+import csv
+import requests
+from typing import Union, List
+import torch
# Application object; tests/test_app.py wraps it in a TestClient.
app = FastAPI()


class SentimentQuery(BaseModel):
    # Request body of POST /predict: "input_texts" holds either one text
    # or a list of texts.
    input_texts: Union[str, List[str]]
task = 'sentiment'

# Tab-separated mapping file; the second column of each row is used as the
# label name, and rows are kept in file order.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"


def _load_labels(url, timeout=10.0):
    """Download the mapping file at *url* and return its label column.

    Args:
        url: Location of the tab-separated mapping file.
        timeout: Seconds to wait on the network before giving up, so that
            importing this module cannot hang indefinitely.

    Returns:
        The second column of every row that has at least two columns,
        in file order.

    Raises:
        RuntimeError: If the file yields no labels at all.
    """
    with urllib.request.urlopen(url, timeout=timeout) as f:
        # splitlines() copes with both "\n" and "\r\n" line endings.
        rows = f.read().decode('utf-8').splitlines()
    found = [row[1] for row in csv.reader(rows, delimiter='\t') if len(row) > 1]
    if not found:
        # Fail at startup rather than with an IndexError on every request.
        raise RuntimeError(f"no labels found in mapping file: {url}")
    return found


labels = _load_labels(mapping_link)

# Model and tokenizer are loaded once at import time and shared by all
# requests.
MODEL = f"cardiffnlp/twitter-roberta-base-{task}-latest"
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
def _score_batch(texts):
    """Return one row of softmax class scores per entry of *texts*.

    Synchronous helper: preprocesses and tokenizes the texts as one padded
    batch, runs the module-level model without gradient tracking, and
    applies softmax over the last axis of the logits.
    """
    encoded_batch = tokenizer(
        [preprocess(t) for t in texts],
        padding=True,          # pad to same length
        truncation=True,       # truncate long texts
        return_tensors="pt",
    )
    with torch.no_grad():
        output = model(**encoded_batch)
    logits = output[0].detach().cpu().numpy()
    return softmax(logits, axis=-1)


@app.post("/predict")
async def analyze_text(query:SentimentQuery):
    """Classify the sentiment of one text or a list of texts.

    Args:
        query: Request body; ``input_texts`` is a single string or a list
            of strings.

    Returns:
        A dict with ``"status"`` set to ``"successful"`` and
        ``"response_body"`` holding one entry per input text, in input
        order. Each entry carries the original ``"input_text"``, the
        predicted label under ``"prediction"`` and the per-class
        ``"scores"``.

    Raises:
        HTTPException: 422 when ``input_texts`` is an empty list.
    """
    # Imported here so the block does not depend on the module header.
    from fastapi.concurrency import run_in_threadpool

    if isinstance(query.input_texts, str):
        input_texts = [query.input_texts]
    else:  # already a List[str]
        input_texts = query.input_texts

    # An empty list would otherwise reach the tokenizer/model as an empty
    # batch; reject it up front with a client error instead.
    if not input_texts:
        raise HTTPException(
            status_code=422,
            detail="input_texts must contain at least one text",
        )

    # Inference is blocking; running it in the thread pool keeps the event
    # loop free to serve other requests while the model works.
    scores = await run_in_threadpool(_score_batch, input_texts)

    # Score keys are fixed names for class indices 0/1/2; tests/test_app.py
    # asserts this exact key order.
    # NOTE(review): assumes `labels` follows the same index order - confirm.
    response_body = [
        {
            "input_text": text,
            "prediction": labels[int(row.argmax())],
            "scores": {
                "negative": float(row[0]),
                "neutral": float(row[1]),
                "positive": float(row[2]),
            },
        }
        for text, row in zip(input_texts, scores)
    ]
    return {
        "status" : "successful",
        "response_body": response_body
    }
# Script entry point: serve the app on every interface, port 8000.
# NOTE(review): this module uses a relative import (`.utils`), so the guard
# is presumably only reachable via `python -m app.app` - confirm.
if __name__ == "__main__":
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=8000)

src/app/app_post.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import requests
# Manual smoke test: POST three sample sentences to a server expected at
# 127.0.0.1:8000 and print what comes back.
url = "http://127.0.0.1:8000/predict"
data = {
    "input_texts" : [
            "Today I am feeling very happy!!",
            "Today I am not feeling very happy at all!!",
            "Today I am feeling no particular mood."]
}

# Bounded wait so the script cannot hang forever on an unresponsive server.
response = requests.post(url, json=data, timeout=30)

if response.status_code == 200:
    response_json = response.json()
    print(response_json["status"])
    for message in response_json["response_body"]:
        print(message)
else:
    # Error bodies are not guaranteed to be JSON; fall back to the raw text
    # instead of crashing while reporting the failure.
    try:
        error_detail = response.json()
    except ValueError:
        error_detail = response.text
    print(f"error: {response.status_code} - {error_detail}")

src/app/utils.py ADDED Viewed

	@@ -0,0 +1,7 @@

def preprocess(text: str) -> str:
    """Mask user mentions and links in *text*.

    The text is split on single spaces. Every token that starts with
    ``@`` and is longer than one character becomes ``@user``; every token
    that starts with ``http`` becomes ``http``. All other tokens are kept
    unchanged, and the tokens are joined back with single spaces, so the
    original spacing is preserved.

    Args:
        text: Raw input text.

    Returns:
        The text with mentions and links replaced by placeholders.
    """
    def _mask(token: str) -> str:
        # A lone "@" is not a mention and is left untouched.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

tests/__pycache__/test_app.cpython-311-pytest-9.0.0.pyc ADDED Viewed

Binary file (23 kB). View file

tests/__pycache__/test_data.cpython-311-pytest-9.0.0.pyc ADDED Viewed

Binary file (416 Bytes). View file

tests/test_app.py ADDED Viewed

	@@ -0,0 +1,89 @@

+from fastapi.testclient import TestClient
+from app.app import app
+client = TestClient(app)
def test_correct_response_structure():
    """A valid single-text request returns the expected response keys."""
    data = {
    "input_texts" :
            "Today I am feeling very happy!!"
    }
    response = client.post("/predict", json = data)

    # Check the status before parsing, so a failure reports the HTTP code
    # rather than an error from decoding an unexpected body.
    assert response.status_code == 200
    response_json = response.json()

    assert "status" in response_json
    assert "response_body" in response_json

    first_item = response_json["response_body"][0]
    for key in ("input_text", "prediction", "scores"):
        assert key in first_item
def test_incorrect_response():
    """An ``input_texts`` value that is neither text nor a list is rejected."""
    data = {
    "input_texts" :
            5
    }
    response = client.post("/predict", json = data)
    assert response.status_code == 422 # validation error by pydantic
def test_single_prediction():
    """A single text yields exactly one well-formed prediction entry."""
    input_text = "Today I am feeling very happy!!"
    data = {
        "input_texts" : input_text
    }
    response = client.post("/predict", json = data)

    # Status first, so a failing request is reported by its HTTP code.
    assert response.status_code == 200
    response_json = response.json()
    assert response_json["status"] == "successful"

    response_body = response_json["response_body"]
    assert len(response_body) == 1

    item = response_body[0]
    assert item["input_text"] == input_text
    assert item["prediction"] in ["positive", "negative", "neutral"]

    scores = item["scores"]
    assert isinstance(scores, dict)
    assert len(scores) == 3
    # Key order is part of the checked contract.
    assert list(scores) == ["negative", "neutral", "positive"]
    for value in scores.values():
        assert isinstance(value, float)
def test_multiple_predictions():
    """A list of texts yields one well-formed entry per text, in order."""
    input_texts = ["Today I am feeling very happy!!",
                    "Today I am not feeling very happy at all!!",
                    "Today I am feeling no particular mood."]
    data = {
        "input_texts" : input_texts
    }
    response = client.post("/predict", json = data)

    # Status first, so a failing request is reported by its HTTP code.
    assert response.status_code == 200
    response_json = response.json()
    assert response_json["status"] == "successful"

    response_body = response_json["response_body"]
    # The length check must pass before pairing, since zip() would silently
    # stop at the shorter sequence.
    assert len(response_body) == len(input_texts)

    for input_text, single_response in zip(input_texts, response_body):
        assert single_response["input_text"] == input_text
        assert single_response["prediction"] in ["positive", "negative", "neutral"]

        scores = single_response["scores"]
        assert isinstance(scores, dict)
        assert len(scores) == 3
        # Key order is part of the checked contract.
        assert list(scores) == ["negative", "neutral", "positive"]
        for value in scores.values():
            assert isinstance(value, float)

tests/test_data.py ADDED Viewed

	@@ -0,0 +1,4 @@


1	+ import pytest
2	+
3	+ from datasets import load_dataset
4	+