Spaces:
Running
Running
lucabadiali
committed on
Commit
·
57fbf67
1
Parent(s):
c57b942
Tried first dashboard
Browse files- prometheus.yml +2 -1
- src/app/__pycache__/config.cpython-311.pyc +0 -0
- src/app/app.py +133 -2
- src/app/config.py +20 -11
- src/app/utils.py +29 -1
- src/nb.ipynb +208 -0
- src/train_model.py +4 -10
prometheus.yml
CHANGED
|
@@ -3,10 +3,11 @@ global:
|
|
| 3 |
|
| 4 |
scrape_configs:
|
| 5 |
- job_name: "fastapi_hf"
|
| 6 |
-
scheme:
|
| 7 |
metrics_path: /metrics
|
| 8 |
static_configs:
|
| 9 |
- targets:
|
|
|
|
| 10 |
- "lucabadiali-ml-ops-project.hf.space:443"
|
| 11 |
# If the Space is private, add:
|
| 12 |
# authorization:
|
|
|
|
| 3 |
|
| 4 |
scrape_configs:
|
| 5 |
- job_name: "fastapi_hf"
|
| 6 |
+
# The active target below listens on port 443 (TLS), so it must be scraped
# over HTTPS. Use "http" only when scraping the local
# "host.docker.internal:8000" target instead.
scheme: https
|
| 7 |
metrics_path: /metrics
|
| 8 |
static_configs:
|
| 9 |
- targets:
|
| 10 |
+
#- "host.docker.internal:8000"
|
| 11 |
- "lucabadiali-ml-ops-project.hf.space:443"
|
| 12 |
# If the Space is private, add:
|
| 13 |
# authorization:
|
src/app/__pycache__/config.cpython-311.pyc
CHANGED
|
Binary files a/src/app/__pycache__/config.cpython-311.pyc and b/src/app/__pycache__/config.cpython-311.pyc differ
|
|
|
src/app/app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from fastapi import FastAPI, HTTPException
|
| 2 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 3 |
-
from .utils import preprocess
|
| 4 |
from scipy.special import softmax
|
| 5 |
import numpy as np
|
| 6 |
from pydantic import BaseModel
|
|
@@ -9,13 +9,57 @@ import csv
|
|
| 9 |
import requests
|
| 10 |
from typing import Union, List
|
| 11 |
import torch
|
| 12 |
-
from .config import MODEL_SOURCE, ModelSource
|
| 13 |
from prometheus_fastapi_instrumentator import Instrumentator
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
app = FastAPI()
|
| 17 |
Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False)
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
class SentimentQuery(BaseModel):
|
| 21 |
input_texts: Union[str, List[str]]
|
|
@@ -27,6 +71,7 @@ with urllib.request.urlopen(mapping_link) as f:
|
|
| 27 |
labels = [row[1] for row in csvreader if len(row) > 1]
|
| 28 |
|
| 29 |
tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)
|
|
|
|
| 30 |
|
| 31 |
@app.get("/")
|
| 32 |
def read_root():
|
|
@@ -55,6 +100,11 @@ async def analyze_text(query:SentimentQuery):
|
|
| 55 |
|
| 56 |
response_body = []
|
| 57 |
for i,text in enumerate(input_texts):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
response_body.append(
|
| 59 |
{
|
| 60 |
"input_text":text,
|
|
@@ -73,6 +123,87 @@ async def analyze_text(query:SentimentQuery):
|
|
| 73 |
}
|
| 74 |
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
if __name__ == "__main__":
|
| 77 |
import uvicorn
|
| 78 |
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
| 1 |
from fastapi import FastAPI, HTTPException
|
| 2 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 3 |
+
from .utils import preprocess, load_model_and_tokenizer
|
| 4 |
from scipy.special import softmax
|
| 5 |
import numpy as np
|
| 6 |
from pydantic import BaseModel
|
|
|
|
| 9 |
import requests
|
| 10 |
from typing import Union, List
|
| 11 |
import torch
|
| 12 |
+
from .config import MODEL_SOURCE, ModelSource
|
| 13 |
from prometheus_fastapi_instrumentator import Instrumentator
|
| 14 |
|
| 15 |
+
##################
|
| 16 |
+
from prometheus_client import Counter, Gauge
|
| 17 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
| 18 |
+
from datetime import datetime
|
| 19 |
+
import os
|
| 20 |
+
import random
|
| 21 |
+
import pandas as pd
|
| 22 |
+
#################
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
#############
|
| 26 |
+
from .config import EVAL_BATCH_SIZE, N_SAMPLES, DATASET_PATH, EVAL_PERIOD_MIN
|
| 27 |
+
from .utils import load_dataset
|
| 28 |
+
###########
|
| 29 |
|
| 30 |
app = FastAPI()
|
| 31 |
Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False)
|
| 32 |
|
| 33 |
+
###################
|
| 34 |
+
# ---------- Metrics (custom) ----------
|
| 35 |
+
# Production predictions distribution (unlabeled)
|
| 36 |
+
# PRED_COUNTER = Counter(
|
| 37 |
+
# "sentiment_requests_total",
|
| 38 |
+
# "Total predictions served by label",
|
| 39 |
+
# ["label"]
|
| 40 |
+
# )
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# EVAL_SAMPLE_SIZE = Gauge(
|
| 45 |
+
# "model_evaluation_sample_size",
|
| 46 |
+
# "Number of samples used in the latest periodic evaluation"
|
| 47 |
+
# )
|
| 48 |
+
# EVAL_COUNTER_DIST = Counter(
|
| 49 |
+
# "sentiment_test_distribution_total",
|
| 50 |
+
# "Cumulative predicted label counts on evaluation samples",
|
| 51 |
+
# ["label"]
|
| 52 |
+
# )
|
| 53 |
+
# EVAL_RUNS = Counter(
|
| 54 |
+
# "model_evaluations_total",
|
| 55 |
+
# "Total number of evaluation runs completed"
|
| 56 |
+
# )
|
| 57 |
+
##################
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
|
| 63 |
|
| 64 |
class SentimentQuery(BaseModel):
|
| 65 |
input_texts: Union[str, List[str]]
|
|
|
|
| 71 |
labels = [row[1] for row in csvreader if len(row) > 1]
|
| 72 |
|
| 73 |
tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)
|
| 74 |
+
model.eval()
|
| 75 |
|
| 76 |
@app.get("/")
|
| 77 |
def read_root():
|
|
|
|
| 100 |
|
| 101 |
response_body = []
|
| 102 |
for i,text in enumerate(input_texts):
|
| 103 |
+
|
| 104 |
+
predicted = labels[pred_labels[i]]
|
| 105 |
+
#PRED_COUNTER.labels(label=predicted).inc()
|
| 106 |
+
|
| 107 |
+
|
| 108 |
response_body.append(
|
| 109 |
{
|
| 110 |
"input_text":text,
|
|
|
|
| 123 |
}
|
| 124 |
|
| 125 |
|
| 126 |
+
|
| 127 |
+
def evaluate_accuracy():
    """Estimate model accuracy on a random sample of the labeled test split.

    Loads the dataset through ``load_dataset(DATASET_PATH)``, takes up to
    ``N_SAMPLES`` shuffled rows of its ``"test"`` split, runs them through the
    module-level ``tokenizer`` and ``model`` in batches of ``EVAL_BATCH_SIZE``
    and compares the arg-max prediction with the ``"label"`` column.

    Returns:
        float: fraction of correctly classified samples, or ``0.0`` when the
        sample is empty.
    """
    dataset = load_dataset(DATASET_PATH).shuffle()["test"][:N_SAMPLES]
    # ``targets`` rather than ``labels`` so the module-level list of label
    # names is not shadowed inside this function.
    texts, targets = dataset["text"], dataset["label"]

    # Divide by the number of rows actually evaluated: the split may hold
    # fewer than N_SAMPLES rows, which would otherwise deflate the accuracy.
    n_total = len(texts)
    if n_total == 0:
        return 0.0

    model.eval()
    n_correct = 0
    # Stepping by the batch size never yields an empty trailing batch, even
    # when n_total is an exact multiple of EVAL_BATCH_SIZE.
    for start in range(0, n_total, EVAL_BATCH_SIZE):
        stop = start + EVAL_BATCH_SIZE
        encoded_batch = tokenizer(
            [preprocess(t) for t in texts[start:stop]],
            padding=True,  # pad to same length
            truncation=True,  # truncate long texts
            return_tensors="pt",
        )

        with torch.no_grad():
            output = model(**encoded_batch)

        logits = output[0].detach().cpu().numpy()
        # softmax is monotonic, so the arg-max of the logits is the arg-max
        # of the probabilities; no need to normalise first.
        pred_labels = logits.argmax(axis=-1)
        n_correct += int((pred_labels == np.asarray(targets[start:stop])).sum())

    return n_correct / n_total
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
# Evaluation metrics (labeled test set)
|
| 158 |
+
EVAL_ACCURACY = Gauge(
|
| 159 |
+
"model_evaluation_accuracy",
|
| 160 |
+
"Accuracy on latest periodic evaluation of labeled test subset"
|
| 161 |
+
)
|
| 162 |
+
|
| 163 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
| 164 |
+
from datetime import datetime, timedelta
|
| 165 |
+
import threading
|
| 166 |
+
|
| 167 |
+
_model_lock = threading.Lock()
|
| 168 |
+
|
| 169 |
+
def _run_eval_and_set_gauge():
    """Run one evaluation pass and publish the result to ``EVAL_ACCURACY``.

    The evaluation is executed while holding ``_model_lock``.
    """
    # NOTE(review): the lock only serialises code paths that also acquire
    # ``_model_lock``; confirm the request handlers take it if contention
    # with live inference is a concern.
    with _model_lock:
        EVAL_ACCURACY.set(evaluate_accuracy())
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
scheduler = BackgroundScheduler(daemon=True)
|
| 177 |
+
|
| 178 |
+
@app.on_event("startup")
def _start_scheduler():
    """Register the evaluation jobs and start the background scheduler."""
    # One-off evaluation shortly after the application has started.
    first_run_at = datetime.now() + timedelta(seconds=2)
    scheduler.add_job(_run_eval_and_set_gauge, next_run_time=first_run_at)

    # Recurring evaluation every EVAL_PERIOD_MIN minutes.
    scheduler.add_job(
        _run_eval_and_set_gauge,
        trigger="interval",
        minutes=EVAL_PERIOD_MIN,
    )

    scheduler.start()
|
| 185 |
+
|
| 186 |
+
@app.on_event("shutdown")
def _stop_scheduler():
    """Shut the background scheduler down when the application stops."""
    # wait=False is passed so that shutdown does not wait on the scheduler
    # (presumably to avoid blocking on a running evaluation job; confirm
    # against the APScheduler documentation).
    scheduler.shutdown(wait=False)
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
|
| 207 |
if __name__ == "__main__":
|
| 208 |
import uvicorn
|
| 209 |
uvicorn.run(app, host="0.0.0.0", port=8000)
|
src/app/config.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import os
|
| 2 |
from enum import Enum
|
| 3 |
-
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 4 |
from pathlib import Path
|
| 5 |
|
| 6 |
|
|
@@ -10,15 +9,25 @@ class ModelSource(str, Enum):
|
|
| 10 |
|
| 11 |
MODEL_SOURCE = ModelSource(os.getenv("MODEL_SOURCE", "hf"))
|
| 12 |
HF_MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
from enum import Enum
|
|
|
|
| 3 |
from pathlib import Path
|
| 4 |
|
| 5 |
|
|
|
|
| 9 |
|
| 10 |
# Which model to serve; read from the MODEL_SOURCE environment variable and
# defaulting to "hf".
MODEL_SOURCE = ModelSource(os.getenv("MODEL_SOURCE", "hf"))
# Plain string literal: the original f-string had no placeholders.
HF_MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Location checked for a dataset saved on disk.
DATASET_PATH = Path("data/dataset")


# Evaluation settings that can be overridden through environment variables.
EVAL_SAMPLE_SIZE = int(os.getenv("EVAL_SAMPLE_SIZE", "80"))
EVAL_INTERVAL_HOURS = float(os.getenv("EVAL_INTERVAL_HOURS", "1"))
RANDOM_SEED = int(os.getenv("RANDOM_SEED", "42"))

# Fixed evaluation settings.
EVAL_BATCH_SIZE = 64  # samples per forward pass
N_SAMPLES = 500  # upper bound on test rows drawn per evaluation
EVAL_PERIOD_MIN = 1  # minutes between periodic evaluations
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# def load_model_and_tokenizer(MODEL_SOURCE):
|
| 25 |
+
# if MODEL_SOURCE == ModelSource.HF: # use the latest model available in the HF hub
|
| 26 |
+
# tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
|
| 27 |
+
# model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL)
|
| 28 |
+
# else: # use a locally fine tuned model
|
| 29 |
+
# local_model_path = Path("models/saved_model")
|
| 30 |
+
# assert local_model_path.exists(), """No local model was found. Run 'python3 src/train_model.py' first"""
|
| 31 |
+
# tokenizer = AutoTokenizer.from_pretrained("models/saved_tokenizer")
|
| 32 |
+
# model = AutoModelForSequenceClassification.from_pretrained("models/saved_model")
|
| 33 |
+
# return tokenizer, model
|
src/app/utils.py
CHANGED
|
@@ -1,7 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
def preprocess(text):
    """Mask user mentions and links in *text*.

    The text is split on single spaces. Every token that starts with ``@``
    and is longer than one character becomes ``@user``; every token that
    starts with ``http`` becomes ``http``. All other tokens are kept as-is
    and the tokens are re-joined with single spaces.
    """
    def _mask(token):
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from .config import ModelSource, HF_MODEL
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 4 |
+
from datasets import load_dataset as hf_load_dataset
|
| 5 |
+
from datasets import load_from_disk
|
| 6 |
+
|
| 7 |
+
|
| 8 |
def preprocess(text):
    """Mask user mentions and links in *text*.

    The text is split on single spaces. Every token that starts with ``@``
    and is longer than one character becomes ``@user``; every token that
    starts with ``http`` becomes ``http``. All other tokens are kept as-is
    and the tokens are re-joined with single spaces.
    """
    def _mask(token):
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def load_model_and_tokenizer(MODEL_SOURCE):
    """Load the tokenizer and sequence-classification model for *MODEL_SOURCE*.

    Args:
        MODEL_SOURCE: ``ModelSource.HF`` loads ``HF_MODEL`` through
            ``from_pretrained``; any other value loads the locally saved
            artifacts under ``models/``.

    Returns:
        tuple: ``(tokenizer, model)``.

    Raises:
        FileNotFoundError: if the local model is requested but
            ``models/saved_model`` does not exist.
    """
    if MODEL_SOURCE == ModelSource.HF:  # use the latest model available in the HF hub
        tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
        model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL)
        return tokenizer, model

    # use a locally fine tuned model
    local_model_path = Path("models/saved_model")
    # Explicit raise instead of ``assert``: assertions are stripped when Python
    # runs with -O, which would silently skip this check.
    if not local_model_path.exists():
        raise FileNotFoundError(
            "No local model was found. Run 'python3 src/train_model.py' first"
        )
    tokenizer = AutoTokenizer.from_pretrained("models/saved_tokenizer")
    model = AutoModelForSequenceClassification.from_pretrained("models/saved_model")
    return tokenizer, model
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def load_dataset(dataset_path):
    """Return the dataset saved at *dataset_path*, or fetch it when absent.

    Args:
        dataset_path: path object (it must provide ``exists()``) pointing at
            a dataset previously saved to disk.

    Returns:
        The result of ``load_from_disk(dataset_path)`` when the path exists,
        otherwise the ``tweet_eval`` / ``sentiment`` dataset obtained through
        ``hf_load_dataset``.
    """
    if not dataset_path.exists():
        return hf_load_dataset('tweet_eval', 'sentiment')
    return load_from_disk(dataset_path)
|
src/nb.ipynb
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 48,
|
| 6 |
+
"id": "7aaceacb",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"from pathlib import Path\n",
|
| 11 |
+
"from app.config import DATASET_PATH, MODEL_SOURCE\n",
|
| 12 |
+
"from app.utils import load_dataset, load_model_and_tokenizer, preprocess\n",
|
| 13 |
+
"from scipy.special import softmax\n"
|
| 14 |
+
]
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"cell_type": "code",
|
| 18 |
+
"execution_count": 49,
|
| 19 |
+
"id": "7defab3e",
|
| 20 |
+
"metadata": {},
|
| 21 |
+
"outputs": [
|
| 22 |
+
{
|
| 23 |
+
"name": "stderr",
|
| 24 |
+
"output_type": "stream",
|
| 25 |
+
"text": [
|
| 26 |
+
"Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
|
| 27 |
+
"- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
|
| 28 |
+
"- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
|
| 29 |
+
]
|
| 30 |
+
}
|
| 31 |
+
],
|
| 32 |
+
"source": [
|
| 33 |
+
"tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)\n"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"cell_type": "code",
|
| 38 |
+
"execution_count": 24,
|
| 39 |
+
"id": "0a1dcfdd",
|
| 40 |
+
"metadata": {},
|
| 41 |
+
"outputs": [],
|
| 42 |
+
"source": [
|
| 43 |
+
"dataset = load_dataset(DATASET_PATH).shuffle()[\"test\"][:N_SAMPLES]\n"
|
| 44 |
+
]
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"cell_type": "code",
|
| 48 |
+
"execution_count": 33,
|
| 49 |
+
"id": "501e6728",
|
| 50 |
+
"metadata": {},
|
| 51 |
+
"outputs": [],
|
| 52 |
+
"source": [
|
| 53 |
+
"import torch"
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"cell_type": "code",
|
| 58 |
+
"execution_count": 47,
|
| 59 |
+
"id": "82b25de1",
|
| 60 |
+
"metadata": {},
|
| 61 |
+
"outputs": [
|
| 62 |
+
{
|
| 63 |
+
"data": {
|
| 64 |
+
"text/plain": [
|
| 65 |
+
"2"
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
"execution_count": 47,
|
| 69 |
+
"metadata": {},
|
| 70 |
+
"output_type": "execute_result"
|
| 71 |
+
}
|
| 72 |
+
],
|
| 73 |
+
"source": [
|
| 74 |
+
"N_BEVAL_BATCH_SIZE = 64\n",
|
| 75 |
+
"N_SAMPLES = 500\n",
|
| 76 |
+
"N_BATCHES = len(dataset[\"text\"])//EVAL_BATCH_SIZE\n",
|
| 77 |
+
"N_BATCHES"
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"cell_type": "code",
|
| 82 |
+
"execution_count": 54,
|
| 83 |
+
"id": "7dd5371b",
|
| 84 |
+
"metadata": {},
|
| 85 |
+
"outputs": [
|
| 86 |
+
{
|
| 87 |
+
"name": "stdout",
|
| 88 |
+
"output_type": "stream",
|
| 89 |
+
"text": [
|
| 90 |
+
"0 64\n",
|
| 91 |
+
"64 128\n",
|
| 92 |
+
"128 192\n",
|
| 93 |
+
"192 256\n",
|
| 94 |
+
"256 320\n",
|
| 95 |
+
"320 384\n",
|
| 96 |
+
"384 448\n",
|
| 97 |
+
"448 500\n"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"data": {
|
| 102 |
+
"text/plain": [
|
| 103 |
+
"np.float64(0.71)"
|
| 104 |
+
]
|
| 105 |
+
},
|
| 106 |
+
"execution_count": 54,
|
| 107 |
+
"metadata": {},
|
| 108 |
+
"output_type": "execute_result"
|
| 109 |
+
}
|
| 110 |
+
],
|
| 111 |
+
"source": [
|
| 112 |
+
"EVAL_BATCH_SIZE = 64\n",
|
| 113 |
+
"N_SAMPLES = 500\n",
|
| 114 |
+
"def evaluate_accuracy():\n",
|
| 115 |
+
"\n",
|
| 116 |
+
" dataset = load_dataset(DATASET_PATH).shuffle()[\"test\"][:N_SAMPLES]\n",
|
| 117 |
+
" N_BATCHES = len(dataset[\"text\"])//EVAL_BATCH_SIZE\n",
|
| 118 |
+
"\n",
|
| 119 |
+
" accuracy = 0\n",
|
| 120 |
+
" for i in range(N_BATCHES+1):\n",
|
| 121 |
+
"\n",
|
| 122 |
+
" start = i*EVAL_BATCH_SIZE\n",
|
| 123 |
+
" end = min(N_SAMPLES, (i+1)*EVAL_BATCH_SIZE)\n",
|
| 124 |
+
" print(start, end)\n",
|
| 125 |
+
" samples, labels = dataset[\"text\"][start:end], dataset[\"label\"][start:end]\n",
|
| 126 |
+
" \n",
|
| 127 |
+
" model.eval()\n",
|
| 128 |
+
" encoded_batch = tokenizer(\n",
|
| 129 |
+
" [preprocess(t) for t in samples],\n",
|
| 130 |
+
" padding=True, # pad to same length\n",
|
| 131 |
+
" truncation=True, # truncate long texts\n",
|
| 132 |
+
" return_tensors=\"pt\",\n",
|
| 133 |
+
" )\n",
|
| 134 |
+
"\n",
|
| 135 |
+
" with torch.no_grad():\n",
|
| 136 |
+
" output = model(**encoded_batch)\n",
|
| 137 |
+
" \n",
|
| 138 |
+
" logits = output[0].detach().cpu().numpy()\n",
|
| 139 |
+
" scores = softmax(logits, axis=-1)\n",
|
| 140 |
+
" pred_labels = scores.argmax(axis=-1)\n",
|
| 141 |
+
" accuracy += sum(pred_labels==labels)\n",
|
| 142 |
+
" accuracy/=N_SAMPLES\n",
|
| 143 |
+
" return accuracy\n",
|
| 144 |
+
"evaluate_accuracy()"
|
| 145 |
+
]
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"cell_type": "code",
|
| 149 |
+
"execution_count": 1,
|
| 150 |
+
"id": "dbd3bb8c",
|
| 151 |
+
"metadata": {},
|
| 152 |
+
"outputs": [],
|
| 153 |
+
"source": [
|
| 154 |
+
"def _load_test_data():\n",
|
| 155 |
+
" \"\"\"\n",
|
| 156 |
+
" Expects CSV with columns: text,label\n",
|
| 157 |
+
" label values must be one of labels (negative, neutral, positive) or their indices (0,1,2).\n",
|
| 158 |
+
" \"\"\"\n",
|
| 159 |
+
" df = pd.read_csv(TEST_DATA_PATH)\n",
|
| 160 |
+
" # normalize label column to strings matching our 'labels' list\n",
|
| 161 |
+
" if np.issubdtype(df[\"label\"].dtype, np.number):\n",
|
| 162 |
+
" df[\"label\"] = df[\"label\"].astype(int).map(lambda i: labels[i])\n",
|
| 163 |
+
" else:\n",
|
| 164 |
+
" df[\"label\"] = df[\"label\"].str.lower().str.strip()\n",
|
| 165 |
+
" # keep only supported labels\n",
|
| 166 |
+
" df = df[df[\"label\"].isin(labels)].dropna(subset=[\"text\", \"label\"])\n",
|
| 167 |
+
" return df"
|
| 168 |
+
]
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"cell_type": "code",
|
| 172 |
+
"execution_count": null,
|
| 173 |
+
"id": "ec0b086e",
|
| 174 |
+
"metadata": {},
|
| 175 |
+
"outputs": [],
|
| 176 |
+
"source": []
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"cell_type": "code",
|
| 180 |
+
"execution_count": null,
|
| 181 |
+
"id": "800c8018",
|
| 182 |
+
"metadata": {},
|
| 183 |
+
"outputs": [],
|
| 184 |
+
"source": []
|
| 185 |
+
}
|
| 186 |
+
],
|
| 187 |
+
"metadata": {
|
| 188 |
+
"kernelspec": {
|
| 189 |
+
"display_name": "ProjectEnv",
|
| 190 |
+
"language": "python",
|
| 191 |
+
"name": "python3"
|
| 192 |
+
},
|
| 193 |
+
"language_info": {
|
| 194 |
+
"codemirror_mode": {
|
| 195 |
+
"name": "ipython",
|
| 196 |
+
"version": 3
|
| 197 |
+
},
|
| 198 |
+
"file_extension": ".py",
|
| 199 |
+
"mimetype": "text/x-python",
|
| 200 |
+
"name": "python",
|
| 201 |
+
"nbconvert_exporter": "python",
|
| 202 |
+
"pygments_lexer": "ipython3",
|
| 203 |
+
"version": "3.11.10"
|
| 204 |
+
}
|
| 205 |
+
},
|
| 206 |
+
"nbformat": 4,
|
| 207 |
+
"nbformat_minor": 5
|
| 208 |
+
}
|
src/train_model.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from app.utils import preprocess
|
| 2 |
import urllib
|
| 3 |
import csv
|
| 4 |
import os
|
|
@@ -8,8 +8,8 @@ from transformers import (
|
|
| 8 |
TrainingArguments, Trainer, EarlyStoppingCallback,
|
| 9 |
DataCollatorWithPadding
|
| 10 |
)
|
| 11 |
-
from datasets import load_dataset,load_from_disk
|
| 12 |
from pathlib import Path
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
# --- Device detection ---
|
|
@@ -73,14 +73,8 @@ model.gradient_checkpointing_enable()
|
|
| 73 |
model.config.use_cache = False
|
| 74 |
|
| 75 |
#### DATASET LOADING
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
dataset_path = Path("data/dataset")
|
| 79 |
-
if dataset_path.exists():
|
| 80 |
-
dataset = load_from_disk(dataset_path)
|
| 81 |
-
else:
|
| 82 |
-
dataset = load_dataset('tweet_eval', 'sentiment')
|
| 83 |
-
|
| 84 |
|
| 85 |
|
| 86 |
# ---- COPY-PASTE FROM HERE ----
|
|
|
|
| 1 |
+
from app.utils import preprocess, load_dataset
|
| 2 |
import urllib
|
| 3 |
import csv
|
| 4 |
import os
|
|
|
|
| 8 |
TrainingArguments, Trainer, EarlyStoppingCallback,
|
| 9 |
DataCollatorWithPadding
|
| 10 |
)
|
|
|
|
| 11 |
from pathlib import Path
|
| 12 |
+
from app.config import DATASET_PATH
|
| 13 |
|
| 14 |
|
| 15 |
# --- Device detection ---
|
|
|
|
| 73 |
model.config.use_cache = False
|
| 74 |
|
| 75 |
#### DATASET LOADING
|
| 76 |
+
|
| 77 |
+
dataset = load_dataset(DATASET_PATH)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
# ---- COPY-PASTE FROM HERE ----
|