# Spaces status: Build error
import datetime
import math
import os
import re

import en_core_web_sm
import nltk
import spacy
import streamlit as st
import torch
from annotated_text import annotated_text
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from spacy import displacy
from transformers import pipeline
from transformers import AutoTokenizer,AutoModelForTokenClassification,AutoModelForSequenceClassification,BertForSequenceClassification
from word2number import w2n
# spaCy pipeline: used for sentence splitting and for displaCy rendering.
nlp = en_core_web_sm.load()

# NLTK resources needed by word_tokenize() and the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')

# Sentence-embedding model used to rank passages against a query sentence.
similarityModel = SentenceTransformer('BAAI/bge-small-en')

# Text-classification pipeline applied to each transcript chunk.
sentiment_model = pipeline("text-classification", model="AhmedTaha012/managersFeedback-V1.0.7")

# Sequence classifier (and its tokenizer) used for the next-quarter prediction.
tokenizerQuarter = AutoTokenizer.from_pretrained('AhmedTaha012/nextQuarter-status-V1.1.9')
modelQuarter = BertForSequenceClassification.from_pretrained('AhmedTaha012/nextQuarter-status-V1.1.9')

# SECURITY: a Hugging Face access token used to be hard-coded on the two
# calls below. Secrets must not live in source control, so the token is now
# read from the HF_TOKEN environment variable (None means "no explicit
# token"). The token that was previously committed should be revoked.
_HF_TOKEN = os.environ.get("HF_TOKEN")
_TOPIC_CHECKPOINT = "nickmuchi/finbert-tone-finetuned-finance-topic-classification"
tokenizerTopic = AutoTokenizer.from_pretrained(_TOPIC_CHECKPOINT, use_fast=True, token=_HF_TOKEN)
modelTopic = AutoModelForSequenceClassification.from_pretrained(_TOPIC_CHECKPOINT, token=_HF_TOKEN)

# Token-classification (NER) model for financial entities.
tokenizer = AutoTokenizer.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")

# aggregation_strategy="simple" is the documented replacement for the
# deprecated grouped_entities=True flag; both group adjacent tokens that
# share an entity into a single result.
nlpPipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Seed the "disabled" flag the first time a session runs the script.
if "disabled" not in st.session_state:
    st.session_state["disabled"] = False
def disable():
    """Set the "disabled" flag in the Streamlit session state to True."""
    # Attribute access and item access address the same session-state entry.
    st.session_state.disabled = True
def getSpeakers(data):
    """Return the participant lines that follow the speakers heading.

    The headings "Speakers", "Call participants" and "Call Participants" are
    tried in that order. For the first heading present in ``data``, the text
    after its last occurrence is split into lines and only the lines
    containing "--" are kept.

    Args:
        data: Full transcript text.

    Returns:
        The kept lines joined with newlines, or "" when none of the headings
        occurs in ``data`` (previously the function fell through and returned
        None, unlike getQA which returns "").
    """
    for heading in ("Speakers", "Call participants", "Call Participants"):
        if heading in data:
            # rpartition(...)[2] is the text after the LAST occurrence,
            # the same piece split(heading)[-1] yields.
            tail = data.rpartition(heading)[2]
            return "\n".join(line for line in tail.split("\n") if "--" in line)
    return ""
def removeSpeakers(data):
    """Return the transcript text that precedes the speakers heading.

    The headings "Speakers", "Call participants" and "Call Participants" are
    tried in that order; for the first one present, everything before its
    first occurrence is returned.

    Args:
        data: Full transcript text.

    Returns:
        The text before the heading, or ``data`` unchanged when no heading is
        present. (The function used to return None in that case, which broke
        any caller that went on to treat the result as a string.)
    """
    for heading in ("Speakers", "Call participants", "Call Participants"):
        if heading in data:
            # partition(...)[0] is the text before the FIRST occurrence,
            # the same piece split(heading)[0] yields.
            return data.partition(heading)[0]
    return data
def getQA(data):
    """Return the text that follows the question-and-answer heading.

    The headings "Questions and Answers", "Questions & Answers" and "Q&A" are
    tried in that order. For the first one present, the text after its last
    occurrence is returned; when none is present the result is "".
    """
    headings = ("Questions and Answers", "Questions & Answers", "Q&A")
    heading = next((h for h in headings if h in data), None)
    if heading is None:
        return ""
    # Text after the last occurrence of the heading.
    return data.rpartition(heading)[2]
def removeQA(data):
    """Return the transcript text that precedes the Q&A heading.

    The headings "Questions and Answers", "Questions & Answers" and "Q&A" are
    tried in that order; for the first one present, everything before its
    first occurrence is returned.

    Args:
        data: Transcript text.

    Returns:
        The text before the heading, or ``data`` unchanged when there is no
        Q&A heading. (Returning "" in that case, as before, threw away the
        entire transcript whenever a call had no Q&A section.)
    """
    for heading in ("Questions and Answers", "Questions & Answers", "Q&A"):
        if heading in data:
            return data.partition(heading)[0]
    return data
# NOTE: a function with this same name is defined again further down in this
# file; because it comes later, that definition is the one callers get.
def clean_and_preprocess(text):
    """Lower-case long lines and strip English stop words from them.

    Only lines longer than 100 characters are kept. Each kept line is
    lower-cased, tokenized with NLTK, filtered against the NLTK English
    stop-word list and re-joined with single spaces.

    Args:
        text: Raw text, lines separated by "\\n".

    Returns:
        The cleaned lines joined with "\\n" ("" when no line is long enough).
    """
    long_lines = [line for line in text.split("\n") if len(line) > 100]
    if not long_lines:
        return ""
    # The stop-word set does not depend on the line, so build it once
    # instead of once per loop iteration.
    stop_words = set(stopwords.words('english'))
    cleaned = []
    for line in long_lines:
        words = nltk.word_tokenize(line.lower())
        cleaned.append(' '.join(word for word in words if word not in stop_words))
    return "\n".join(cleaned)
# Abbreviation -> expansion table. Keys that used to be listed twice are
# de-duplicated here, keeping the value that was effective before (in a dict
# literal the later entry wins).
_ABBREVIATIONS = {
    'Q1': 'first quarter',
    'Q2': 'second quarter',
    'Q3': 'third quarter',
    'Q4': 'fourth quarter',
    'q1': 'first quarter',
    'q2': 'second quarter',
    'q3': 'third quarter',
    'q4': 'fourth quarter',
    'FY': 'fiscal year',
    'YoY': 'year over year',
    'MoM': 'month over month',
    'EBITDA': 'earnings before interest, taxes, depreciation, and amortization',
    'ROI': 'return on investment',
    'EPS': 'earnings per share',
    'P/E': 'price-to-earnings',
    'DCF': 'discounted cash flow',
    'CAGR': 'compound annual growth rate',
    'GDP': 'gross domestic product',
    'CFO': 'chief financial officer',
    'GAAP': 'Generally Accepted Accounting Principles',
    'SEC': 'U.S. Securities and Exchange Commission',
    'IPO': 'initial public offering',
    'M&A': 'mergers and acquisitions',
    'EBIT': 'earnings before interest and taxes',
    'IRR': 'internal rate of return',
    'ROA': 'return on assets',
    'ROE': 'return on equity',
    'NAV': 'net asset value',
    'PE ratio': 'price-to-earnings ratio',
    'EPS growth': 'earnings per share growth',
    'Fiscal Year': 'financial year',
    'CAPEX': 'capital expenditures',
    'APR': 'annual percentage rate',
    'P&L': 'profit and loss',
    'NPM': 'net profit margin',
    'EBT': 'earnings before taxes',
    'EBITDAR': 'earnings before interest, taxes, depreciation, amortization, and rent',
    'PAT': 'profit after tax',
    'COGS': 'cost of goods sold',
    'EBTIDA': 'earnings before taxes, interest, depreciation, and amortization',
    'E&Y': 'Ernst & Young',
    'B2B': 'business to business',
    'B2C': 'business to consumer',
    'LIFO': 'last in, first out',
    'FIFO': 'first in, first out',
    'FCF': 'free cash flow',
    'LTM': 'last twelve months',
    'OPEX': 'operating expenses',
    'TSR': 'total shareholder return',
    'PP&E': 'property, plant, and equipment',
    'PBT': 'profit before tax',
    'EBITDAR margin': 'earnings before interest, taxes, depreciation, amortization, and rent margin',
    'ROIC': 'return on invested capital',
    'YOY': 'year-over-year',
    'MOM': 'month-over-month',
    'EBIT margin': 'earnings before interest and taxes margin',
    'EBTA': 'earnings before taxes and amortization',
    'FTE': 'full-time equivalent',
    'EBIDTA': 'earnings before interest, depreciation, taxes, and amortization',
    'PESTEL': 'Political, Economic, Social, Technological, Environmental, and Legal',
    'KPI': 'key performance indicator',
    'SWOT': 'Strengths, Weaknesses, Opportunities, Threats',
    'EBITDARM': 'earnings before interest, taxes, depreciation, amortization, rent, and management fees',
    'EBITDAX': 'earnings before interest, taxes, depreciation, amortization, and exploration expenses',
    'EBITDAS': 'earnings before interest, taxes, depreciation, amortization, and restructuring costs',
    'EBITDAX-C': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and commodity derivatives',
    'EBITDAX-R': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and asset retirement obligations',
    'EBITDAX-E': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and environmental liabilities',
    # Add more abbreviations and replacements as needed
}

# One alternation over every abbreviation, longest first so "EBITDAR" wins
# over "EBITDA" and "EBIT". The look-arounds reject matches glued to other
# letters, so ordinary words such as "APRIL" are left alone; digits are still
# allowed next to a match (e.g. "FY2023").
_ABBREVIATION_PATTERN = re.compile(
    r"(?<![A-Za-z])(?:"
    + "|".join(re.escape(key) for key in sorted(_ABBREVIATIONS, key=len, reverse=True))
    + r")(?![A-Za-z])"
)


def replace_abbreviations(text):
    """Expand known financial abbreviations in ``text``.

    All replacements happen in a single left-to-right pass, so an expansion
    is never re-scanned and a short abbreviation can no longer corrupt a
    longer one that starts with it (sequential ``str.replace`` used to turn
    "EBITDAR" into "...and amortizationR").

    Args:
        text: Input text.

    Returns:
        ``text`` with every standalone abbreviation from the table replaced
        by its expansion.
    """
    return _ABBREVIATION_PATTERN.sub(lambda match: _ABBREVIATIONS[match.group(0)], text)
# NOTE: an earlier definition with the same name exists above in this file;
# being the later one, this definition is what callers actually get.
def clean_and_preprocess(text):
    """Lower-case long lines and strip English stop words from them.

    Only lines longer than 100 characters are kept. Each kept line is
    lower-cased, tokenized with NLTK, filtered against the NLTK English
    stop-word list and re-joined with single spaces.

    Args:
        text: Raw text, lines separated by "\\n".

    Returns:
        The cleaned lines joined with "\\n" ("" when no line is long enough).
    """
    long_lines = [line for line in text.split("\n") if len(line) > 100]
    if not long_lines:
        return ""
    # Loop-invariant: build the stop-word set once rather than per line.
    stop_words = set(stopwords.words('english'))
    cleaned = []
    for line in long_lines:
        words = nltk.word_tokenize(line.lower())
        cleaned.append(' '.join(word for word in words if word not in stop_words))
    return "\n".join(cleaned)
def convert_amount_to_number(amount_str):
    """Convert a number written in words to a number via word2number.

    Returns whatever ``w2n.word_to_num`` produces, or 0 when it raises
    ValueError for the given input.
    """
    try:
        number = w2n.word_to_num(amount_str)
    except ValueError:
        # Conversion failed: fall back to 0.
        number = 0
    return number
def getTopic(encoded_input):
    """Return the topic label that modelTopic predicts for one encoded input.

    ``encoded_input`` is unpacked as keyword arguments into ``modelTopic``;
    the index of the largest logit is mapped through the model's
    ``id2label`` table.
    """
    # modelTopic.to("cuda")
    with torch.no_grad():  # inference only, no gradients needed
        output = modelTopic(**encoded_input)
    best_class = output.logits.argmax().item()
    return modelTopic.config.id2label[best_class]
def selectedCorpusForNextQuarterModel(x, quarter, year):
    """Pick the passages of a transcript most relevant to a quarter's results.

    Steps:
      1. Tokenize ``x`` with ``tokenizerTopic`` and cut the token ids into
         windows of 256.
      2. Keep the windows whose predicted topic (``getTopic``) is one of the
         selected finance topics, decode them back to text and drop passages
         of 10 characters or fewer.
      3. Embed a query sentence ("the <nth> quarter results of the <year>")
         together with the passages and return the three passages most
         similar to the query.

    Args:
        x: Transcript text.
        quarter: Quarter number, 1-4 (int or str).
        year: Year used in the query sentence.

    Returns:
        Up to three passages joined with ",", most similar first; "" when no
        passage survives the filters.

    Raises:
        KeyError: if ``quarter`` is not one of 1-4.
    """
    number_word_dict = {
        "1": "first",
        "2": "second",
        "3": "third",
        "4": "fourth",
        # Add more entries as needed
    }
    # NOTE(review): truncation=True presumably caps the encoding at the
    # tokenizer's maximum length before it is windowed below, so long
    # transcripts may only be partially covered - confirm this is intended.
    tokens = tokenizerTopic(x, padding=True, truncation=True, return_tensors='pt')
    splitSize = 256
    fields = ("input_ids", "token_type_ids", "attention_mask")
    total = len(tokens["input_ids"][0])
    # Slice each field into windows and restore the leading batch dimension
    # with unsqueeze(0) instead of rebuilding tensors element by element.
    windows = [
        {name: tokens[name][0][start:start + splitSize].unsqueeze(0) for name in fields}
        for start in range(0, total, splitSize)
    ]
    selectedTopics = ["Stock Movement", "Earnings", "IPO", "Stock Commentary", "Currencies", "M&A | Investments", "Financials", "Macro", "Analyst Update", "Company | Product News"]
    passages = [
        tokenizerTopic.decode(window["input_ids"][0], skip_special_tokens=True)
        for window in windows
        if getTopic(window) in selectedTopics
    ]
    passages = [passage for passage in passages if len(passage) > 10]
    des = f"the {number_word_dict[str(quarter)]} quarter results of the {year}"
    if not passages:
        # Nothing to rank; the result is the empty join either way.
        return ""
    embeddings = similarityModel.encode([des] + passages, normalize_embeddings=True, show_progress_bar=False)
    # Only the query-vs-passage row is needed, so compute just that row once
    # rather than the full pairwise matrix twice.
    scores = cosine_similarity(embeddings[:1], embeddings[1:])[0]
    # Rank by position, not by value: looking a score up with list.index()
    # returned the same passage repeatedly whenever two scores were equal.
    ranked = sorted(range(len(passages)), key=lambda i: scores[i], reverse=True)[:3]
    return ",".join(passages[i] for i in ranked)
def getQuarterPrediction(text):
    """Return the label modelQuarter predicts for ``text``.

    The text is encoded with ``tokenizerQuarter`` (truncated to 512 tokens),
    run through ``modelQuarter`` without gradients, and the index of the
    largest logit is mapped through the model's ``id2label`` table.
    """
    encoded = tokenizerQuarter(
        text,
        add_special_tokens=True,
        padding=True,
        truncation=True,
        max_length=512,
        return_overflowing_tokens=False,
        return_tensors="pt",
    )
    with torch.no_grad():
        output = modelQuarter(**encoded)
    best_class = output.logits.argmax().item()
    return modelQuarter.config.id2label[best_class]
def getSentence(listOfSentences, value):
    """Return the first sentence that contains ``value``.

    Falls back to ``value`` itself when no sentence in ``listOfSentences``
    contains it.
    """
    matches = (sentence for sentence in listOfSentences if value in sentence)
    return next(matches, value)
def get_annotated_text(text, value, entity):
    """Render ``text`` with ``value`` highlighted as ``entity`` and split it.

    The text is parsed with spaCy, the first occurrence of ``value`` becomes
    the document's only entity, and the displaCy entity HTML is written to
    the page with ``st.markdown``.

    Args:
        text: Sentence to display.
        value: Substring to highlight.
        entity: Label shown on the highlight.

    Returns:
        ``[before, (value, entity), after]`` where ``before`` and ``after``
        are the parts of ``text`` around the first occurrence of ``value``.
        When ``value`` is empty or absent, ``before`` is the whole text and
        ``after`` is "".
    """
    doc = nlp(text)
    if value and value in text:
        # partition keeps everything after the first occurrence;
        # text.split(value)[1] dropped the text after a second occurrence.
        before, _, after = text.partition(value)
        start = len(before)
        # "expand" snaps the character range outwards to token boundaries.
        # In the default strict mode char_span returns None for a misaligned
        # range, and assigning [None] to doc.ents raises.
        span = doc.char_span(start, start + len(value), label=entity, alignment_mode="expand")
    else:
        before, after, span = text, "", None
    doc.ents = [span] if span is not None else []
    ent_html = displacy.render(doc, style='ent', jupyter=False)
    # Display the entity visualization in the browser.
    st.markdown(ent_html, unsafe_allow_html=True)
    return [before, (value, entity), after]
# Page title and the raw transcript input.
st.header("Transcript Analysis", divider='rainbow')
mainTranscript = st.text_area("Enter the transcript:", height=100)

# Sentence-split the transcript exactly as entered.
doc = nlp(mainTranscript)
sentences = [sentence.text for sentence in doc.sents]

# Reporting period selectors: quarter 1-4, and years from the current year
# counting down to 1901.
quarter = st.selectbox('Select your quarter', ('1', '2', '3', '4'))
current_year = datetime.datetime.now().year
year = st.selectbox('Select your year', tuple(str(y) for y in range(current_year, 1900, -1)))
if st.button("Analyze"):
    # --- Pre-processing: expand abbreviations, drop the speaker list and the
    # Q&A section, then clean the remaining text.
    transcript = replace_abbreviations(mainTranscript)
    transcript = removeSpeakers(transcript)
    transcript = removeQA(transcript)
    transcript = clean_and_preprocess(transcript)

    # Cut the cleaned text into chunks of 256 whitespace-separated words.
    tokens = transcript.split()
    splitSize = 256
    chunks = [" ".join(tokens[start:start + splitSize]) for start in range(0, len(tokens), splitSize)]
    if not chunks:
        # Without this guard the majority vote below calls max() on an empty
        # list and the app dies with a ValueError.
        st.warning("No analyzable text was found in the transcript.")
        st.stop()

    # --- Management sentiment: majority vote over the per-chunk labels.
    st.subheader("Management Sentiment", divider='rainbow')
    labels = [sentiment_model(chunk)[0]['label'] for chunk in chunks]
    sentiment = max(labels, key=labels.count)
    # NOTE(review): "postive" is compared verbatim against the model's label;
    # it looks like a typo but may be the label the model really emits -
    # confirm against the model config before changing it.
    sentiment_color = "green" if sentiment == "postive" else "red"
    st.markdown(f'<span style="color:{sentiment_color}">{sentiment}</span>', unsafe_allow_html=True)

    # --- Next-quarter prediction on the most relevant passages.
    st.subheader("Next Quarter Prediction", divider='rainbow')
    increase_decrease = getQuarterPrediction(selectedCorpusForNextQuarterModel(mainTranscript, quarter, year))
    increase_decrease_color = "green" if increase_decrease == "Increase" else "red"
    st.markdown(f'<span style="color:{increase_decrease_color}">{increase_decrease}</span>', unsafe_allow_html=True)

    # --- Financial metrics: collect NER hits from every chunk.
    st.subheader("Financial Metrics", divider='rainbow')
    ner_result = []
    for chunk in chunks:
        ents = nlpPipe(chunk)
        position = 0
        while position < len(ents):
            word = ents[position]["word"]
            group = ents[position]["entity_group"]
            word_count = len(word.split())
            if word_count == 2:
                ner_result.append({group: word})
            elif word_count == 1 and position + 2 < len(ents):
                # A one-word hit is glued to the next two hits, which are
                # then skipped. The explicit bounds check replaces a bare
                # "except: pass" that hid the IndexError near the end of
                # the list (and any other error along with it).
                ner_result.append({group: word + ents[position + 1]["word"] + ents[position + 2]["word"]})
                position += 2
            position += 1

    profits = [hit["profit"] for hit in ner_result if "profit" in hit]
    revenues = [hit["revenue"] for hit in ner_result if "revenue" in hit]
    expenses = [hit["expense"] for hit in ner_result if "expense" in hit]

    # Show every metric with the sentence it came from; same order as before
    # (revenues, profits, expenses), one loop instead of three copies.
    for label, values in (("Revenue", revenues), ("Profit", profits), ("Expenses", expenses)):
        for number, value in enumerate(values, start=1):
            st.text_input(f'{label}:{number}', value)
            get_annotated_text(getSentence(sentences, value), str(value), label)

    # --- Recommendation: positive only when all three signals agree.
    st.subheader("Investment Recommendation", divider='rainbow')
    profitAmount = sum(convert_amount_to_number(amount) for amount in profits)
    expensesAmount = sum(convert_amount_to_number(amount) for amount in expenses)
    if increase_decrease == "Increase" and sentiment == "postive" and profitAmount > expensesAmount:
        st.markdown('<span style="color:green">This is a great chance for investment. Do consider it.</span>', unsafe_allow_html=True)
    else:
        st.markdown('<span style="color:red">Not the best chance for investment.</span>', unsafe_allow_html=True)