Spaces:
Runtime error
Runtime error
| import os | |
| import torch | |
| import torch.nn as nn | |
| import numpy as np | |
| import pandas as pd | |
| import json | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler | |
| from sentence_transformers import SentenceTransformer, util | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline | |
| from symspellpy import SymSpell, Verbosity | |
| import gradio as gr | |
| # Ensure Hugging Face cache directory is writable | |
| os.environ["TRANSFORMERS_CACHE"] = "/home/user/.cache/huggingface" | |
| # Set device | |
| device = torch.device("cpu") | |
| # Define DiseaseClassifier Model | |
| class DiseaseClassifier(nn.Module): | |
| def __init__(self, input_size, num_classes, dropout_rate=0.35665610394511454): | |
| super(DiseaseClassifier, self).__init__() | |
| self.fc1 = nn.Linear(input_size, 382) | |
| self.fc2 = nn.Linear(382, 389) | |
| self.fc3 = nn.Linear(389, 433) | |
| self.fc4 = nn.Linear(433, num_classes) | |
| self.activation = nn.LeakyReLU() | |
| self.dropout = nn.Dropout(dropout_rate) | |
| def forward(self, x): | |
| x = self.activation(self.fc1(x)) | |
| x = self.dropout(x) | |
| x = self.activation(self.fc2(x)) | |
| x = self.dropout(x) | |
| x = self.activation(self.fc3(x)) | |
| x = self.dropout(x) | |
| x = self.fc4(x) # Logits | |
| return x | |
| # Define DiseasePredictionModel | |
| class DiseasePredictionModel: | |
| def __init__(self, ai_model_name="model.pth", data_file="data.csv", symptom_json="symptoms.json", dictionary_file="frequency_dictionary_en_82_765.txt"): | |
| # Load dataset | |
| self.df = pd.read_csv(data_file) | |
| self.symptom_columns = self.load_symptoms(symptom_json) | |
| self.label_encoder = LabelEncoder() | |
| self.label_encoder.fit(self.df.iloc[:, 0]) | |
| self.scaler = StandardScaler() | |
| self.scaler.fit(self.df.iloc[:, 1:].values) | |
| self.input_size = len(self.symptom_columns) | |
| self.num_classes = len(self.label_encoder.classes_) | |
| self.model = self._load_model(ai_model_name) | |
| self.SYMPTOM_LIST = self.load_symptoms(symptom_json) | |
| # Load SymSpell dictionary | |
| self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) | |
| self.sym_spell.load_dictionary(dictionary_file, term_index=0, count_index=1) | |
| # Load BioBERT tokenizer and model | |
| self.tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1") | |
| self.nlp_model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-v1.1") | |
| self.ner_pipeline = pipeline("ner", model=self.nlp_model, tokenizer=self.tokenizer, aggregation_strategy="simple") | |
| # Load Sentence Transformer | |
| self.semantic_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") | |
| def _load_model(self, ai_model_name): | |
| model = DiseaseClassifier(self.input_size, self.num_classes).to(device) | |
| model.load_state_dict(torch.load(ai_model_name, map_location=device)) | |
| model.eval() | |
| return model | |
| def predict_disease(self, symptoms): | |
| input_vector = np.zeros(len(self.symptom_columns)) | |
| for symptom in symptoms: | |
| if symptom in self.symptom_columns: | |
| input_vector[list(self.symptom_columns).index(symptom)] = 1 | |
| input_vector = self.scaler.transform([input_vector]) | |
| input_tensor = torch.tensor(input_vector, dtype=torch.float32).to(device) | |
| with torch.no_grad(): | |
| outputs = self.model(input_tensor) | |
| _, predicted_class = torch.max(outputs, 1) | |
| predicted_disease = self.label_encoder.inverse_transform([predicted_class.cpu().numpy()[0]])[0] | |
| return predicted_disease | |
| def load_symptoms(self, json_file): | |
| with open(json_file, "r", encoding="utf-8") as f: | |
| return json.load(f) | |
| def correct_text(self, text): | |
| words = text.split() | |
| corrected_words = [] | |
| for word in words: | |
| if word.lower() in [symptom.lower() for symptom in self.SYMPTOM_LIST]: | |
| corrected_words.append(word) | |
| else: | |
| suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2) | |
| if suggestions: | |
| corrected_words.append(suggestions[0].term) | |
| else: | |
| corrected_words.append(word) | |
| return ' '.join(corrected_words) | |
| def extract_symptoms(self, text): | |
| ner_results = self.ner_pipeline(text) | |
| symptoms = set() | |
| for entity in ner_results: | |
| if entity["entity_group"] == "DISEASE": | |
| symptoms.add(entity["word"].lower()) | |
| return list(symptoms) | |
| def match_symptoms(self, extracted_symptoms): | |
| matched = {} | |
| symptom_embeddings = self.semantic_model.encode(self.SYMPTOM_LIST, convert_to_tensor=True) | |
| for symptom in extracted_symptoms: | |
| symptom_embedding = self.semantic_model.encode(symptom, convert_to_tensor=True) | |
| similarities = util.pytorch_cos_sim(symptom_embedding, symptom_embeddings)[0] | |
| most_similar_idx = similarities.argmax() | |
| best_match = self.SYMPTOM_LIST[most_similar_idx] | |
| matched[symptom] = best_match | |
| return matched.values() | |
| # Initialize Model | |
| model = DiseasePredictionModel() | |
| # Define Prediction Function | |
| def predict(symptoms): | |
| corrected = model.correct_text(symptoms) | |
| extracted = model.extract_symptoms(corrected) | |
| matched = model.match_symptoms(extracted) | |
| prediction = model.predict_disease(matched) | |
| return prediction | |
| # Define Gradio Interface | |
| iface = gr.Interface(fn=predict, inputs="text", outputs="text", title="Disease Prediction AI") | |
| iface.launch() | |