File size: 10,748 Bytes
b190b45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#!/usr/bin/env python3
"""

Hugging Face Inference API Client - REAL DATA ONLY

Uses real Hugging Face models for sentiment analysis

NO MOCK DATA - All predictions from real HF models

"""

import logging
import os
from datetime import datetime, timezone
from typing import Any, Dict, Optional

import httpx
from fastapi import HTTPException

logger = logging.getLogger(__name__)


class HuggingFaceInferenceClient:
    """Real Hugging Face Inference API client.

    Primary source for real sentiment analysis using hosted NLP models.
    All predictions come from real HF models; the only non-model path is
    an explicit keyword fallback when the endpoint itself is gone (404/410).
    """

    def __init__(self):
        # Strip whitespace from token to avoid "Illegal header value" errors
        self.api_token = (os.getenv("HF_API_TOKEN") or os.getenv("HF_TOKEN") or "").strip()
        self.base_url = "https://router.huggingface.co/models"
        self.timeout = 30.0  # HF models can take time to load

        # Real sentiment analysis models, keyed by use case
        self.models = {
            "sentiment_crypto": "cardiffnlp/twitter-roberta-base-sentiment-latest",
            "sentiment_financial": "ProsusAI/finbert",
            "sentiment_twitter": "finiteautomata/bertweet-base-sentiment-analysis",
            "sentiment_general": "nlptown/bert-base-multilingual-uncased-sentiment"
        }

        self.headers = {
            "Content-Type": "application/json"
        }
        # Anonymous requests are allowed; only attach auth when a token exists
        if self.api_token:
            self.headers["Authorization"] = f"Bearer {self.api_token}"

    @staticmethod
    def _timestamp_ms() -> int:
        """Return the current UTC time as epoch milliseconds.

        Uses an aware UTC datetime: calling ``.timestamp()`` on the naive
        ``datetime.utcnow()`` interprets the value in the machine's *local*
        timezone, which skews the result on any non-UTC host.
        """
        return int(datetime.now(timezone.utc).timestamp() * 1000)

    def _loading_payload(self, model_name: str, estimated_time: float) -> Dict[str, Any]:
        """Build the structured 'model is still loading' response body."""
        return {
            "error": "Model is currently loading",
            "estimated_time": estimated_time,
            "model": model_name,
            "timestamp": self._timestamp_ms()
        }

    def _keyword_fallback(self, text: str) -> Dict[str, Any]:
        """Crude keyword-count sentiment, used only when the HF endpoint
        returns 404/410 (moved or model withdrawn).

        Counts positive vs. negative market keywords; ties are neutral.
        """
        text_lower = (text or "").lower()
        pos_kw = ["bull", "up", "gain", "profit", "surge", "rally", "strong"]
        neg_kw = ["bear", "down", "loss", "drop", "dump", "sell", "weak"]
        pos_score = sum(k in text_lower for k in pos_kw)
        neg_score = sum(k in text_lower for k in neg_kw)
        if pos_score > neg_score:
            label, sentiment, score = "POSITIVE", "positive", 0.7
        elif neg_score > pos_score:
            label, sentiment, score = "NEGATIVE", "negative", 0.7
        else:
            label, sentiment, score = "NEUTRAL", "neutral", 0.5
        return {
            "label": label,
            "score": score,
            "sentiment": sentiment,
            "confidence": score,
            "text": text[:100] + ("..." if len(text) > 100 else ""),
            "model": "fallback-keywords",
            "source": "fallback",
            "timestamp": self._timestamp_ms()
        }

    def _normalize_sentiment_label(self, label: str, score: float) -> tuple[str, str]:
        """Normalize different model label formats to a standard format.

        Handles plain labels (POSITIVE/NEG/...), indexed labels
        (LABEL_0..LABEL_2), and 1-5 star ratings; anything unrecognized
        falls back to thresholding on the confidence score.

        Args:
            label: Raw label string from the model.
            score: Confidence score for that label, in [0, 1].

        Returns:
            (normalized_label, sentiment_text) — e.g. ("POSITIVE", "positive").
        """
        label_upper = label.upper()

        # Map the common explicit label formats first
        if label_upper in ["POSITIVE", "LABEL_2", "5 STARS", "POS"]:
            return ("POSITIVE", "positive")
        elif label_upper in ["NEGATIVE", "LABEL_0", "1 STAR", "NEG"]:
            return ("NEGATIVE", "negative")
        elif label_upper in ["NEUTRAL", "LABEL_1", "3 STARS", "NEU"]:
            return ("NEUTRAL", "neutral")

        # Star ratings not caught above (e.g. "2 stars", "4 stars")
        if "STAR" in label_upper:
            if "4" in label or "5" in label:
                return ("POSITIVE", "positive")
            elif "1" in label or "2" in label:
                return ("NEGATIVE", "negative")
            else:
                return ("NEUTRAL", "neutral")

        # Unknown label format: fall back to the score thresholds
        if score > 0.6:
            return ("POSITIVE", "positive")
        elif score < 0.4:
            return ("NEGATIVE", "negative")
        else:
            return ("NEUTRAL", "neutral")

    async def analyze_sentiment(
        self,
        text: str,
        model_key: str = "sentiment_crypto"
    ) -> Dict[str, Any]:
        """Analyze REAL sentiment using Hugging Face models.

        Args:
            text: Text to analyze (truncated to 2000 chars before sending).
            model_key: Model to use (sentiment_crypto, sentiment_financial,
                sentiment_twitter, sentiment_general). Unknown keys fall
                back to sentiment_crypto.

        Returns:
            Real sentiment analysis results, or a structured
            "model is loading" / keyword-fallback payload.

        Raises:
            HTTPException: 400 on empty input or bad request, 500 on
                malformed model output or unexpected failure, 503 when
                the HF API is unreachable.
        """
        try:
            # Unknown model keys fall back to the crypto model
            model_name = self.models.get(model_key, self.models["sentiment_crypto"])

            # Validate input before making a network call
            if not text or len(text.strip()) == 0:
                raise HTTPException(
                    status_code=400,
                    detail="Missing or invalid text in request body"
                )

            # Truncate text if too long (max 512 tokens ~ 2000 chars)
            if len(text) > 2000:
                text = text[:2000]

            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(
                    f"{self.base_url}/{model_name}",
                    headers=self.headers,
                    json={"inputs": text}
                )

                # 503 from HF means the model is cold-starting, not an outage
                if response.status_code == 503:
                    try:
                        error_data = response.json()
                        estimated_time = error_data.get("estimated_time", 20)
                    except Exception:
                        # Non-JSON body: use the default estimate
                        estimated_time = 20

                    logger.warning(
                        f"⏳ HuggingFace model {model_name} is loading "
                        f"(estimated: {estimated_time}s)"
                    )
                    return self._loading_payload(model_name, estimated_time)

                response.raise_for_status()
                data = response.json()

                # Parse model response
                # HF returns: [[{"label": "POSITIVE", "score": 0.95}, ...]]
                if isinstance(data, list) and len(data) > 0:
                    # Some models nest predictions one list deeper
                    if isinstance(data[0], list):
                        predictions = data[0]
                    else:
                        predictions = data

                    # Keep only the prediction with the highest score
                    best_prediction = max(predictions, key=lambda x: x.get("score", 0))

                    raw_label = best_prediction.get("label", "NEUTRAL")
                    raw_score = best_prediction.get("score", 0.5)

                    # Normalize model-specific labels to POSITIVE/NEGATIVE/NEUTRAL
                    normalized_label, sentiment_text = self._normalize_sentiment_label(
                        raw_label,
                        raw_score
                    )

                    result = {
                        "label": normalized_label,
                        "score": raw_score,
                        "sentiment": sentiment_text,
                        "confidence": raw_score,
                        "text": text[:100] + ("..." if len(text) > 100 else ""),
                        "model": model_name,
                        "source": "huggingface",
                        "timestamp": self._timestamp_ms()
                    }

                    logger.info(
                        f"✅ HuggingFace: Sentiment analysis completed "
                        f"({normalized_label}, confidence: {raw_score:.2f})"
                    )
                    return result

                else:
                    # Unexpected response format
                    logger.error(f"❌ HuggingFace: Unexpected response format: {data}")
                    raise HTTPException(
                        status_code=500,
                        detail="Unexpected response format from model"
                    )

        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            if status == 503:
                # Model loading — normally handled above; kept as a safety net
                return {
                    "error": "Model is currently loading",
                    "estimated_time": 20,
                    "timestamp": self._timestamp_ms()
                }
            elif status == 400:
                logger.error(f"❌ HuggingFace: Bad request: {e}")
                raise HTTPException(
                    status_code=400,
                    detail="Invalid text or parameters"
                )
            elif status in (404, 410):
                # Endpoint moved or model not available on old host; provide safe fallback
                logger.warning("⚠ HuggingFace endpoint returned 404/410; using keyword fallback")
                return self._keyword_fallback(text)
            else:
                logger.error(f"❌ HuggingFace API HTTP error: {e}")
                raise HTTPException(
                    status_code=503,
                    detail=f"HuggingFace API temporarily unavailable: {str(e)}"
                )

        except httpx.HTTPError as e:
            # Transport-level failures (timeouts, DNS, connection refused)
            logger.error(f"❌ HuggingFace API HTTP error: {e}")
            raise HTTPException(
                status_code=503,
                detail=f"HuggingFace API temporarily unavailable: {str(e)}"
            )

        except HTTPException:
            # Re-raise our own validation/format errors untouched
            raise

        except Exception as e:
            logger.error(f"❌ HuggingFace sentiment analysis failed: {e}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to analyze sentiment: {str(e)}"
            )


# Module-level singleton shared by all importers of this module.
hf_inference_client = HuggingFaceInferenceClient()

# Explicit public API of this module.
__all__ = [
    "HuggingFaceInferenceClient",
    "hf_inference_client",
]