Spaces:

SimulaMet-HOST
/

SoccerRAG

Running

App Files Files Community

buzzCraft committed on May 7, 2024

Commit

291bc70

1 Parent(s): 0e37947

Adding ChainLit demo

Browse files

Files changed (13) hide show

.env_demo +5 -3
.gitignore +11 -0
README.md +16 -2
app.py +176 -0
chainlit.md +14 -0
extractor.py +560 -0
main.py +9 -2
main_cli.py +27 -0
media/chainlit.png +0 -0
requirements.txt +3 -1
src/database.py +45 -38
src/extractor.py +103 -52
src/sql_chain.py +40 -15

.env_demo CHANGED Viewed

@@ -1,4 +1,6 @@
-OPENAI_API_KEY=API_KEY_HERE
 LANGSMITH = False
-LANGSMITH_API_KEY=API_KEY_HERE -NOT NEEDED IF LANGSMITH IS FALSE
-```

+OPENAI_API_KEY=OPENAI_API_KEY
+OPENAI_MODEL = gpt-3.5-turbo-0125
+DATABASE_PATH = data/games.db
 LANGSMITH = False
+LANGSMITH_API_KEY=
+LANGSMITH_PROJECT=SoccerRag

.gitignore CHANGED Viewed

	@@ -1,2 +1,13 @@
1
2	*.pyc

 *.pyc
+.env
+.chainlit/config.toml
+.chainlit/translations/en-US.json
+.idea/inspectionProfiles/profiles_settings.xml
+.idea/inspectionProfiles/Project_Default.xml
+.idea/misc.xml
+.idea/modules.xml
+.idea/soccer-rag.iml
+.idea/vcs.xml
+extractor.log
+data/games.db

README.md CHANGED Viewed

@@ -32,12 +32,25 @@ python src/database.py
 ````
 Adjust the path to the data in the database.py file as needed.
-## Running the code
 To run the code, execute the following command:
 ````bash
 python main.py
 ````
-The code will prompt you to enter a natural language query.
 ### Example query
 ````angular2html
@@ -48,6 +61,7 @@ Lionel Messi has scored the following number of goals each season:
 - 2016-2017: 31 goals
 ````
 ## Results
 ![result-table.png](media%2Fresult-table.png)

 ````
 Adjust the path to the data in the database.py file as needed.
+## Running the code in command line
 To run the code, execute the following command:
 ````bash
+The code will prompt you to enter a natural language query.
 python main.py
 ````
+You can also call main_cli.py with a query as an argument:
+````bash
+python main_cli.py -q "How many goals has Messi scored each season?"
+````
+## Running the code in ChainLit (GUI)
+To run the code in ChainLit, execute the following command:
+````bash
+chainlit run app.py
+````
+This will open up a browser window with the GUI.
+![ChainLit](media/chainlit.png)
 ### Example query
 ````angular2html
 - 2016-2017: 31 goals
 ````
 ## Results
 ![result-table.png](media%2Fresult-table.png)

app.py ADDED Viewed

	@@ -0,0 +1,176 @@

+import os
+from src.extractor import create_extractor
+from src.sql_chain import create_agent
+from dotenv import load_dotenv
+import chainlit as cl
+import json
+# Load variables from the local .env file into the process environment
+load_dotenv(".env")
+# Model name handed to the SQL agent; os.getenv returns None when OPENAI_MODEL is unset
+model = os.getenv('OPENAI_MODEL')
+# Falling back to a default model is currently disabled:
+# if not model:
+#     model = "gpt-3.5-turbo-0125"
+# Module-level extractor and agent shared by every function and handler below
+ex = create_extractor()
+ag = create_agent(llm_model=model)
+# ag = create_agent(llm_model = "gpt-4-0125-preview")
+# NOTE(review): openai_api_key is not referenced again in the visible part of this file - confirm it is needed
+openai_api_key = os.getenv('OPENAI_API_KEY')
+def extract_func(user_prompt: str):
+    """
+    Run the module-level extractor on the raw user prompt.
+    Parameters
+    ----------
+    user_prompt: str
+        The text typed by the user
+    Returns
+    -------
+    Whatever ``ex.extract_chainlit`` returns for that prompt
+    """
+    return ex.extract_chainlit(user_prompt)
+def validate_func(properties:dict):  # Auto validate as much as possible
+    """
+    Validate the extracted properties through the module-level extractor.
+    Parameters
+    ----------
+    properties: dict
+        The extracted properties
+    Returns
+    -------
+    A pair, exactly as produced by ``ex.validate_chainlit``:
+    1. the validated properties
+    2. the properties that still need human validation
+    """
+    auto_validated, pending = ex.validate_chainlit(properties)
+    return auto_validated, pending
+def human_validate_func(human, validated, user_prompt):
+    """
+    Merge the user's choices into the validated properties and build the cleaned prompt.
+    Parameters
+    ----------
+    human - list of dictionaries holding the values chosen by the user
+    validated - dictionary of already validated properties (updated in place)
+    user_prompt - the original user prompt
+    Returns
+    -------
+    The result of ``ex.build_prompt_chainlit`` for the merged properties
+    """
+    for answers in human:
+        for name, chosen in answers.items():
+            # An empty string means the user picked "No Update"
+            if chosen != "":
+                # Extend the existing list, or start a new one for an unseen key
+                validated.setdefault(name, []).append(chosen)
+    return ex.build_prompt_chainlit([validated], user_prompt)
+def no_human(validated, user_prompt):
+    """
+    Build the cleaned prompt when no human validation was required.
+    Parameters
+    ----------
+    validated - dictionary of validated properties
+    user_prompt - the original user prompt
+    Returns
+    -------
+    The result of ``ex.build_prompt_chainlit`` for the validated properties
+    """
+    wrapped = [validated]  # the prompt builder receives a list of property dictionaries
+    return ex.build_prompt_chainlit(wrapped, user_prompt)
+def ask(text):
+    """
+    Send the cleaned prompt to the SQL agent.
+    Parameters
+    ----------
+    text - the prompt to answer
+    Returns
+    -------
+    A pair: a dictionary holding only the agent's "output", and the constant 12
+    """
+    answer, _ = ag.ask(text)
+    # NOTE(review): the second value returned by the agent is discarded and a fixed 12 is
+    # returned instead - looks like a placeholder; confirm before relying on it
+    return {"output": answer["output"]}, 12
+@cl.step
+async def Cleaner(text):
+    """Chainlit step that returns its input unchanged, so the text appears as a step."""
+    return text
+@cl.step
+async def LLM(cleaned_prompt):
+    """Chainlit step that queries the SQL agent through ``ask`` and returns its (answer, value) pair."""
+    answer, extra = ask(cleaned_prompt)
+    return answer, extra
+@cl.step
+async def Choice(text):
+    """Chainlit step that returns its input unchanged; used to record the user's selection."""
+    return text
+@cl.step
+async def Extractor(user_prompt):
+    """Chainlit step that runs property extraction on the user prompt."""
+    return extract_func(user_prompt)
+@cl.on_message  # this function will be called every time a user inputs a message in the UI
+async def main(message: cl.Message):
+    """
+    Handle one chat message: extract properties, validate them (asking the user
+    where needed), build the cleaned prompt and send the agent's answer back.
+    """
+    user_prompt = message.content # Get the user prompt
+    # extracted_values = extract_func(user_prompt)
+    #
+    # json_formatted = json.dumps(extracted_values, indent=4)
+    extracted_values = await Extractor(user_prompt)
+    json_formatted = json.dumps(extracted_values, indent=4)
+    # Show the extracted values as a fenced JSON block
+    await cl.Message(author="Extractor", content=f"Extracted properties:\n```json\n{json_formatted}\n```").send()
+    # Try to validate everything automatically
+    validated, need_input = validate_func(extracted_values)
+    # NOTE(review): this announcement is sent after validate_func has already run
+    await cl.Message(author="Validator", content=f"Extracted properties will now be validated against the database.").send()
+    if need_input:
+        # Some properties could not be validated automatically: ask the user to pick the correct value
+        for element in need_input:
+            key = next(iter(element))  # Get the first key in the dictionary
+            # One action button per candidate listed under 'top_matches'
+            actions = [
+                cl.Action(name=value, value=value, description=str(value))
+                for value in element['top_matches']
+            ]
+            # Add a "No Update" option; its empty value is skipped by human_validate_func
+            actions.append(cl.Action(name="No Update", value="", description="No Update"))
+            res = await cl.AskActionMessage(
+                author="Validator",
+                content=f"Select the correct value for {element[key]}",
+                actions=actions
+            ).send()
+            # A falsy response is treated like "No Update"
+            selected_value = res.get("value", "") if res else ""
+            element[key] = selected_value
+            # Drop the candidates so only the property key and the chosen value remain
+            element.pop("top_matches")
+            await Choice(selected_value)  # Logging choice
+        # Get the cleaned prompt
+        cleaned_prompt = human_validate_func(need_input, validated, user_prompt)
+    else:
+        cleaned_prompt = no_human(validated, user_prompt)
+    # Print the cleaned prompt
+    cleaner_message = cl.Message(author="Cleaner", content=f"New prompt is as follows:\n{cleaned_prompt}")
+    await cleaner_message.send()
+    # Call the SQL agent to get the final answer
+    # ans, const = ask(cleaned_prompt)  # Get the final answer from some function
+    await cl.Message(content=f"I will now query the database for information.").send()
+    ans, const = await LLM(cleaned_prompt)
+    await cl.Message(content=f"This is the final answer: \n\n{ans['output']}").send()

chainlit.md ADDED Viewed

	@@ -0,0 +1,14 @@

+# Welcome to Chainlit! 🚀🤖
+Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
+## Useful Links 🔗
+- **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
+- **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
+We can't wait to see what you create with Chainlit! Happy coding! 💻😊
+## Welcome screen
+To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.

extractor.py ADDED Viewed

	@@ -0,0 +1,560 @@

+from typing import Optional
+from langchain.chains import create_extraction_chain_pydantic
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.chains import create_extraction_chain
+from copy import deepcopy
+from langchain_openai import ChatOpenAI
+from langchain_community.utilities import SQLDatabase
+import os
+import difflib
+import ast
+import json
+import re
+from thefuzz import process
+# Set up logging
+import logging
+from dotenv import load_dotenv
+load_dotenv(".env")
+logging.basicConfig(level=logging.INFO)
+# Save the log to a file: the handler has to be attached to the logger,
+# otherwise extractor.log is created but nothing is ever written to it
+handler = logging.FileHandler('extractor.log')
+logger = logging.getLogger(__name__)
+logger.addHandler(handler)
+# load_dotenv has already exported the key when it is defined; assigning the None
+# returned for a missing key would raise TypeError at import time
+_openai_api_key = os.getenv('OPENAI_API_KEY')
+if _openai_api_key:
+    os.environ["OPENAI_API_KEY"] = _openai_api_key
+# os.environ["ANTHROPIC_API_KEY"] = os.getenv('ANTHROPIC_API_KEY')
+# LANGSMITH is a textual flag (.env_demo ships "False"); any non-empty string is truthy,
+# so compare against the accepted "on" spellings instead of testing truthiness
+if (os.getenv('LANGSMITH') or '').strip().lower() in ('true', '1', 'yes', 'on'):
+    os.environ['LANGCHAIN_TRACING_V2'] = 'true'
+    os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
+    # Only forward settings that are actually present (os.environ rejects None)
+    for _target, _source in (('LANGCHAIN_API_KEY', 'LANGSMITH_API_KEY'),
+                             ('LANGCHAIN_PROJECT', 'LANGSMITH_PROJECT')):
+        _setting = os.getenv(_source)
+        if _setting:
+            os.environ[_target] = _setting
+# NOTE(review): when DATABASE_PATH is unset this becomes "sqlite:///None" - confirm that is acceptable
+db_uri = os.getenv('DATABASE_PATH')
+db_uri = f"sqlite:///{db_uri}"
+db = SQLDatabase.from_uri(db_uri)
+# from langchain_anthropic import ChatAnthropic
+class Extractor():
+    """
+    Thin wrapper around a LangChain extraction chain.
+    Args:
+        model: Name of the OpenAI chat model to use.
+        schema_config: Extraction schema passed to the chain (an empty dict when omitted).
+        custom_extractor_prompt: Optional prompt template text; when omitted the chain
+            is created with prompt=None.
+    """
+    # llm = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)
+    #gpt-3.5-turbo
+    def __init__(self, model="gpt-3.5-turbo-0125", schema_config=None, custom_extractor_prompt=None):
+        # Must be bound even without a custom prompt, otherwise the
+        # create_extraction_chain call below raises NameError
+        cust_promt = None
+        if custom_extractor_prompt:
+            cust_promt = ChatPromptTemplate.from_template(custom_extractor_prompt)
+        self.llm = ChatOpenAI(model=model, temperature=0)
+        # self.llm = ChatAnthropic(model="claude-3-opus-20240229", temperature=0)
+        self.schema = schema_config or {}
+        self.chain = create_extraction_chain(self.schema, self.llm, prompt=cust_promt)
+    def extract(self, query):
+        """Invoke the extraction chain on query and return its raw result."""
+        return self.chain.invoke(query)
+class Retriever():
+    """
+    Reads the values of one database column and matches free text against them.
+    The config dictionary supplies 'db_table' and 'db_column', and optionally
+    'pk_column', 'numeric', 'augmented_table', 'augmented_column' and 'augmented_fk'.
+    db must offer a run(query) method whose result is the textual form of a list of row tuples.
+    """
+    def __init__(self, db, config):
+        self.db = db
+        self.config = config
+        self.table = config.get('db_table')
+        self.column = config.get('db_column')
+        self.pk_column = config.get('pk_column')
+        self.numeric = config.get('numeric', False)
+        self.response = []
+        self.query = f"SELECT {self.column} FROM {self.table}"
+        self.augmented_table = config.get('augmented_table', None)
+        self.augmented_column = config.get('augmented_column', None)
+        self.augmented_fk = config.get('augmented_fk', None)
+    @staticmethod
+    def _quote(value):
+        """Return value as a single-quoted SQL string literal, doubling embedded quotes."""
+        # Without this, names such as O'Neil terminate the literal early and break the query
+        escaped = str(value).replace("'", "''")
+        return f"'{escaped}'"
+    def query_as_list(self):
+        """Load the distinct, non-empty values of the column into self.response and return them."""
+        response = self.db.run(self.query)
+        # An empty result is an empty string, which literal_eval cannot parse
+        rows = ast.literal_eval(response) if response else []
+        response = [el for sub in rows for el in sub if el]
+        if not self.numeric:
+            # Strip stand-alone numbers from textual values
+            response = [re.sub(r"\b\d+\b", "", string).strip() for string in response]
+        self.response = list(set(response))
+        return self.response
+    def get_augmented_items(self, prompt):
+        """
+        Resolve prompt through the augmented table, if one is configured.
+        Returns the value of the main column for the row the augmented entry points to,
+        or None when there is no augmented table or no match.
+        """
+        if self.augmented_table is None:
+            return None
+        # Case-insensitive lookup of the prompt in the augmented table
+        query = (f"SELECT {self.augmented_fk} FROM {self.augmented_table} "
+                 f"WHERE LOWER({self.augmented_column}) = LOWER({self._quote(prompt)})")
+        fk_response = self.db.run(query)
+        if not fk_response:
+            return None
+        fk_value = ast.literal_eval(fk_response)[0][0]
+        # Textual keys must be quoted; numeric keys are interpolated as they are
+        fk_literal = self._quote(fk_value) if isinstance(fk_value, str) else fk_value
+        query = f"SELECT {self.column} FROM {self.table} WHERE {self.pk_column} = {fk_literal}"
+        matching_response = self.db.run(query)
+        if not matching_response:
+            # The foreign key points at no row: report "no match" instead of crashing
+            return None
+        return ast.literal_eval(matching_response)[0][0]
+    def find_close_matches(self, target_string, n=3, method="difflib", threshold=70):
+        """
+        Find the closest matches to target_string among the column values.
+        Args:
+        - target_string (str): The string to match against the database results.
+        - n (int): Maximum number of matches to consider.
+        - method (str): "fuzzy" uses fuzzy_string; anything else uses difflib.
+        - threshold (int): Score threshold forwarded to fuzzy_string.
+        Returns:
+        - With difflib: a list of matching strings, best first.
+        - With "fuzzy": a single string or a list of strings (see fuzzy_string).
+        """
+        # Ensure we have the response list populated
+        if not self.response:
+            self.query_as_list()
+        if method == "fuzzy":
+            return self.fuzzy_string(target_string, limit=n, threshold=threshold)
+        return difflib.get_close_matches(target_string, self.response, n=n, cutoff=0.2)
+    def fuzzy_string(self, prompt, limit, threshold=80, low_threshold=30):
+        """
+        Score prompt against the column values with thefuzz.
+        Returns a single string when exactly one candidate reaches threshold or is the
+        unique top scorer; a list of the tied top scorers when several share the best
+        score; and, when nothing reaches threshold, the list of candidates scoring at
+        least low_threshold (possibly empty).
+        """
+        # Get matches and their scores, limited by the specified 'limit'
+        matches = process.extract(prompt, self.response, limit=limit)
+        filtered_matches = [match for match in matches if match[1] >= threshold]
+        if not filtered_matches:
+            # Fix for wrong properties being returned: drop very weak candidates
+            return [match[0] for match in matches if match[1] >= low_threshold]
+        if len(filtered_matches) == 1:
+            return filtered_matches[0][0]
+        # Several candidates reached the threshold: keep those tied with the first one's score
+        highest_score = filtered_matches[0][1]
+        ties = [match[0] for match in filtered_matches if match[1] == highest_score]
+        return ties[0] if len(ties) == 1 else ties
+    def fetch_pk(self, property_name, property_value):
+        """
+        Look up the primary key for each value.
+        Args:
+        - property_name: Unused; kept for interface compatibility.
+        - property_value: One value or a list of values.
+        Returns:
+        - A list with one entry per value: the raw db.run result for that value,
+          or None for every value when no primary key column is configured.
+        """
+        # Check if the property_value is a list; if not, make it a list for uniform processing
+        if not isinstance(property_value, list):
+            property_value = [property_value]
+        # Some properties do not have a primary key
+        if self.pk_column is None:
+            return [None for _ in property_value]
+        pk_list = []
+        for value in property_value:
+            query = (f"SELECT {self.pk_column} FROM {self.table} "
+                     f"WHERE {self.column} = {self._quote(value)} LIMIT 1")
+            pk_list.append(self.db.run(query))
+        return pk_list
+def setup_retrievers(db, schema_config):
+    """
+    Build one Retriever per schema property, keyed by property name.
+    Every property is an array; the retriever is configured from the property's
+    'items' dictionary, which describes the array's elements.
+    """
+    return {
+        prop: Retriever(db=db, config=config['items'])
+        for prop, config in schema_config["properties"].items()
+    }
+def extract_properties(prompt, schema_config, custom_extractor_prompt=None):
+    """
+    Extract properties from the prompt.
+    Only the 'type' of each property and of its items is handed to the extractor.
+    Returns the 'text' entry of the extraction result, or None (after printing a
+    notice) when that entry is missing or empty.
+    """
+    schema_stripped = {
+        'properties': {
+            name: {'type': spec['type'], 'items': {'type': spec['items']['type']}}
+            for name, spec in schema_config['properties'].items()
+        }
+    }
+    extractor = Extractor(schema_config=schema_stripped, custom_extractor_prompt=custom_extractor_prompt)
+    extraction_result = extractor.extract(prompt)
+    if 'text' not in extraction_result or not extraction_result['text']:
+        print("No properties extracted.")
+        return None
+    return extraction_result['text']
+def recheck_property_value(properties, property_name, retrievers, input_func):
+    """
+    Let the user type a replacement value and pick one of its close matches.
+    Args:
+        properties (dict): Updated in place; properties[property_name] is set to the chosen match.
+        property_name (str): Key of the property being re-entered.
+        retrievers (dict): Retriever objects keyed by property name.
+        input_func (function): Called with a prompt text, returns the user's answer.
+    The loop ends when a match is chosen, or when the user types 'quit' or selects 5;
+    in the latter two cases properties is left untouched.
+    """
+    while True:
+        new_value = input_func(f"Enter new value for {property_name} or type 'quit' to stop: ")
+        if new_value.lower() == 'quit':
+            break  # Exit the loop and do not update the property
+        new_top_matches = retrievers[property_name].find_close_matches(new_value, n=3)
+        if not new_top_matches:
+            print("No close matches found. Please try again or type 'quit' to stop.")
+            continue
+        # Display new top matches and ask for confirmation or re-entry
+        print("\nNew close matches found:")
+        for i, match in enumerate(new_top_matches, start=1):
+            print(f"[{i}] {match}")
+        print("[4] Re-enter value")
+        print("[5] Quit without updating")
+        selection = input_func("Select the best match (1-3), choose 4 to re-enter value, or 5 to quit: ")
+        if selection == '5':
+            break  # Quit without updating
+        # Fewer than three matches may be listed; a number beyond the list is treated
+        # like any other invalid input instead of raising IndexError
+        if selection in ('1', '2', '3') and int(selection) <= len(new_top_matches):
+            selected_match = new_top_matches[int(selection) - 1]
+            properties[property_name] = selected_match  # Update the dictionary directly
+            print(f"Updated {property_name} to {selected_match}")
+            break  # Successfully updated, exit the loop
+        # Loop continues if the user selects 4 or inputs an invalid selection
+def check_and_update_properties(properties_list, retrievers, method="fuzzy", input_func=input):
+    """
+    Checks and updates the properties in the properties list based on close matches found in the database.
+    For every value of every property that has a retriever, the value is resolved in this order:
+    an augmented-table hit, no matches at all (value kept), a single confident match returned
+    as a string (taken directly), an exact match at the top of the list (value kept), and
+    otherwise an interactive choice by the user.
+    Args:
+        properties_list (list of dict): A list of dictionaries, where each dictionary maps a property
+            name to a list of values to check and potentially update.
+        retrievers (dict): A dictionary of Retriever objects keyed by property name, used to find close matches in the database.
+        method (str, optional): Matching method forwarded to find_close_matches. Defaults to "fuzzy".
+        input_func (function, optional): A function to capture user input. Defaults to the built-in input function.
+    The function updates the properties_list in place and returns None.
+    """
+    for properties in properties_list:
+        for property_name, retriever in retrievers.items():
+            property_values = properties.get(property_name, [])
+            if not property_values:  # Skip if the property is not present or is an empty list
+                continue
+            updated_property_values = []  # To store updated list of values
+            for value in property_values:
+                if retriever.augmented_table:
+                    augmented_value = retriever.get_augmented_items(value)
+                    if augmented_value:
+                        updated_property_values.append(augmented_value)
+                        continue
+                top_matches = retriever.find_close_matches(value, method=method, n=3)
+                if not top_matches:
+                    updated_property_values.append(value)  # Keep the original value if no matches found
+                    continue
+                # A string means one confident match. This has to be tested before indexing:
+                # top_matches[0] of a string is its first character, not the best match
+                if isinstance(top_matches, str):
+                    updated_property_values.append(top_matches)
+                    continue
+                # The closest match is the same as the current value: nothing to change
+                if top_matches[0] == value:
+                    updated_property_values.append(value)
+                    continue
+                print(f"\nCurrent {property_name}: {value}")
+                for i, match in enumerate(top_matches, start=1):
+                    print(f"[{i}] {match}")
+                print("[4] Enter new value")
+                choice = input_func(f"Select the best match for {property_name} (1-4): ")
+                # Fewer than three matches may be listed, so check the number against the list
+                if choice in ('1', '2', '3') and int(choice) <= len(top_matches):
+                    selected_match = top_matches[int(choice) - 1]
+                    updated_property_values.append(selected_match)  # Update with the selected match
+                    print(f"Updated {property_name} to {selected_match}")
+                elif choice == '4':
+                    # Re-entry writes its result into a scratch dictionary so that this single
+                    # value can be replaced; the original value is kept if the user quits
+                    scratch = {}
+                    recheck_property_value(scratch, property_name, retrievers, input_func)
+                    updated_property_values.append(scratch.get(property_name, value))
+                else:
+                    print("Invalid selection. Property not updated.")
+                    updated_property_values.append(value)  # Keep the original value
+            # Update the entire list for the property after processing all values
+            properties[property_name] = updated_property_values
+# Function to remove duplicates
+def remove_duplicates(dicts):
+    """
+    Drop key-value pairs that repeat the value most recently recorded for the same key.
+    The dictionaries are modified in place and the same list is returned.
+    """
+    last_seen = {}  # most recently recorded value per key
+    for entry in dicts:
+        # Iterate over a snapshot, because entries are deleted while looping
+        for name, current in tuple(entry.items()):
+            if name not in last_seen or current != last_seen[name]:
+                last_seen[name] = current
+                continue
+            del entry[name]
+    return dicts
+def fetch_pks(properties_list, retrievers):
+    """
+    Fetch primary keys for every property that has a retriever.
+    Returns one dictionary per entry of properties_list, mapping "<property>_pk"
+    to the result of that property's retriever.fetch_pk call.
+    """
+    return [
+        {
+            f"{name}_pk": retrievers[name].fetch_pk(name, value)
+            for name, value in properties.items()
+            if name in retrievers
+        }
+        for properties in properties_list
+    ]
+def update_prompt(prompt, properties, pk, properties_original):
+    """
+    Fill the "{{properties}}" and "{{pk}}" placeholders of prompt, in that order,
+    with the string forms of properties and pk. properties_original is not used.
+    """
+    replacements = (("{{properties}}", str(properties)), ("{{pk}}", str(pk)))
+    for placeholder, text in replacements:
+        prompt = prompt.replace(placeholder, text)
+    return prompt
+def update_prompt_enhanced(prompt, properties, pk, properties_original):
+    """
+    Append an "Updated Information" section to prompt describing renamed values and primary keys.
+    Args:
+        prompt (str): The original prompt.
+        properties (list of dict): Updated properties; None is treated as empty.
+        pk (list of dict): "<property>_pk" entries per properties dictionary; None is treated as empty.
+        properties_original (list of dict): Properties as first extracted; None is treated as empty.
+    Returns:
+        str: The prompt, unchanged when there is nothing to report.
+    """
+    def _as_list(item):
+        # Single values and lists are processed uniformly
+        return item if isinstance(item, list) else [item]
+    notes = []
+    # Nothing extracted means all three inputs may be None: report nothing instead of crashing
+    for prop, pk_info, prop_orig in zip(properties or [], pk or [], properties_original or []):
+        for key in prop:
+            orig_values = _as_list(prop_orig.get(key, []))
+            updated_values = _as_list(prop.get(key, []))
+            pk_details = _as_list(pk_info.get(f"{key}_pk", []))
+            for orig_value, updated_value, pk_detail in zip(orig_values, updated_values, pk_details):
+                pk_value = None
+                if isinstance(pk_detail, str):
+                    # Keep the first field of the textual query result, without brackets or quotes
+                    pk_value = pk_detail.strip("[]()").split(",")[0].replace("'", "").replace('"', '')
+                changed = orig_value != updated_value
+                # Unchanged values without a primary key add no information and are skipped
+                if changed and pk_value:
+                    notes.append(f"\n- {orig_value} (now referred to as {updated_value}) has a primary key: {pk_value}.")
+                elif changed:
+                    notes.append(f"\n- {orig_value} (now referred to as {updated_value}).")
+                elif pk_value:
+                    notes.append(f"\n- {orig_value} has a primary key: {pk_value}.")
+    if notes:
+        prompt += "\nUpdated Information:" + "".join(notes)
+    return prompt
+def prompt_cleaner(prompt, db, schema_config):
+    """
+    Main function to clean the prompt.
+    Returns a pair: the prompt with the "Updated Information" section appended where
+    applicable, and the list of primary-key dictionaries (None when nothing was extracted).
+    """
+    retrievers = setup_retrievers(db, schema_config)
+    properties = extract_properties(prompt, schema_config)
+    if not properties:
+        # extract_properties returns None when nothing was found; there is nothing to
+        # deduplicate or look up, so hand the prompt back unchanged
+        return prompt, None
+    # Keep original properties for later use
+    properties_original = deepcopy(properties)
+    # Remove duplicates - Happens when there are more than one player or team in the prompt
+    properties = remove_duplicates(properties)
+    check_and_update_properties(properties, retrievers)
+    pk = fetch_pks(properties, retrievers)
+    cleaned_prompt = update_prompt_enhanced(prompt, properties, pk, properties_original)
+    return cleaned_prompt, pk
+class PromptCleaner:
+    """
+    A class designed to clean and process prompts by extracting properties and updating
+    them based on a predefined schema configuration and database interactions.
+    Attributes:
+        db: A database connection object used to execute queries and fetch data.
+        schema_config: A dictionary defining the schema configuration for the extraction process.
+        schema_config = {
+            "properties": {
+                # Property name
+                "person_name": {"type": "string", "db_table": "players", "db_column": "name", "pk_column": "hash",
+                                    # if mostly numeric, such as 2015-2016 set true
+                                "numeric": False},
+                "team_name": {"type": "string", "db_table": "teams", "db_column": "name", "pk_column": "id",
+                              "numeric": False},
+                              # Add more as needed
+            },
+            # Parameter to extractor, if person_name is required, add it here and the extractor will
+            # return an error if it is not found
+            "required": [],
+        }
+        NOTE(review): setup_retrievers reads each property's settings from an 'items'
+        sub-dictionary, which the example above does not show - confirm the expected layout.
+    Methods:
+        clean(prompt): Cleans the given prompt by extracting and updating properties based on the database.
+    """
+    def __init__(self, db=db, schema_config=None, custom_extractor_prompt=None):
+        """
+        Initializes the PromptCleaner with a database connection and a schema configuration.
+        Args:
+            db: The database connection object to be used for querying. (if none, it will use the default db)
+            schema_config: A dictionary defining properties and their database mappings for extraction and updating.
+            custom_extractor_prompt: Optional prompt template text forwarded to the extractor.
+        """
+        self.db = db
+        self.schema_config = schema_config
+        self.retrievers = setup_retrievers(self.db, self.schema_config)
+        self.cust_extractor_prompt = custom_extractor_prompt
+    def clean(self, prompt, return_pk=False, test=False, verbose = False):
+        """
+        Processes the given prompt to extract properties, update the properties based on
+        close matches within the database, and fetch primary keys for these properties.
+        Args:
+            prompt (str): The prompt text to be cleaned and processed.
+            return_pk (bool): A flag to indicate whether to return primary keys along with the prompt.
+            test (bool): A flag to indicate whether to return only the extracted properties for testing purposes.
+            verbose (bool): A flag to indicate whether to also return the original properties for debugging.
+        Returns:
+            - test=True: the properties as extracted, before any update.
+            - return_pk=True: a tuple of the updated prompt and a list of dictionaries
+              containing primary keys (None when nothing was extracted).
+            - verbose=True: a tuple of the updated prompt and the original properties.
+            - otherwise: the original prompt, with updated information that exists in the db.
+        """
+        # A missing custom prompt is simply forwarded as None, which is the default
+        properties = extract_properties(prompt, self.schema_config, self.cust_extractor_prompt)
+        # Keep original properties for later use
+        properties_original = deepcopy(properties)
+        if test:
+            return properties_original
+        # Remove duplicates - Happens when there are more than one player or team in the prompt
+        # properties = remove_duplicates(properties)
+        pk = None
+        cleaned_prompt = prompt
+        # extract_properties returns None when nothing was found; in that case the
+        # prompt is passed through unchanged instead of iterating over None
+        if properties:
+            check_and_update_properties(properties, self.retrievers)
+            pk = fetch_pks(properties, self.retrievers)
+            cleaned_prompt = update_prompt_enhanced(prompt, properties, pk, properties_original)
+        if return_pk:
+            return cleaned_prompt, pk
+        if verbose:
+            return cleaned_prompt, properties_original
+        return cleaned_prompt
+def load_json(file_path: str) -> dict:
+    """Read the JSON document at file_path and return the parsed content."""
+    # JSON is exchanged as UTF-8; without an explicit encoding the platform default
+    # is used, which can mis-decode non-ASCII text
+    with open(file_path, 'r', encoding='utf-8') as file:
+        return json.load(file)
+def create_extractor(schema: str = "src/conf/schema.json", db: str = db_uri):
+    """
+    Build a PromptCleaner from a schema file and a database URI.
+    Args:
+        schema: Path of the JSON schema configuration.
+        db: Database URI handed to SQLDatabase.from_uri (a string, not a
+            connection object); defaults to the module-level db_uri.
+    Returns:
+        A PromptCleaner using the soccer-specific extraction prompt below.
+    """
+    schema_config = load_json(schema)
+    database = SQLDatabase.from_uri(db)
+    # The prompt text is sent to the model as is; "{input}" is the template placeholder
+    pre_prompt = """Extract and save the relevant entities mentioned \
+                    in the following passage together with their properties.
+                    Only extract the properties mentioned in the 'information_extraction' function.
+                    The questions are soccer related. game_event are things like yellow cards, goals, assists, freekick ect.
+                    Generic properties like, "description", "home team", "away team", "game" ect should NOT be extracted.
+                    If a property is not present and is not required in the function parameters, do not include it in the output.
+                    If no properties are found, return an empty list.
+                    Here are some exampels:
+                    'How many goals did Henry score for Arsnl in the 2015 season?'
+                    person_name': ['Henry'], 'team_name': [Arsnl],'year_season': ['2015'],
+                    Passage:
+                    {input}
+    """
+    return PromptCleaner(database, schema_config, custom_extractor_prompt=pre_prompt)
+if __name__ == "__main__":
+    # Manual smoke run against the local games database
+    schema_path = "src/conf/schema.json"
+    schema_config = load_json(schema_path)
+    # Add game and league to the schema_config
+    # prompter = PromptCleaner(db, schema_config, custom_extractor_prompt=extract_prompt)
+    prompter = create_extractor(schema_path, "sqlite:///data/games.db")
+    cleaned = prompter.clean("Give me goals, shots on target, shots off target and corners from the game between ManU and Swansa")
+    print(cleaned)

main.py CHANGED Viewed

@@ -1,8 +1,15 @@
 from src.extractor import create_extractor
 from src.sql_chain import create_agent
 ex = create_extractor()
-ag = create_agent(llm_model="gpt-3.5-turbo-0125", verbose=False)
-# ag = create_agent(llm_model = "gpt-4-0125-preview")
 def query(prompt):
     clean = ex.clean(prompt)

 from src.extractor import create_extractor
 from src.sql_chain import create_agent
+import os
+from dotenv import load_dotenv
+ex = create_extractor()
+load_dotenv(".env")
+model = os.getenv('OPENAI_MODEL')
 ex = create_extractor()
+ag = create_agent(llm_model=model)
 def query(prompt):
     clean = ex.clean(prompt)

main_cli.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from src.extractor import create_extractor
+from src.sql_chain import create_agent
+import os
+from dotenv import load_dotenv
+load_dotenv(".env")  # must run before os.getenv below so values from .env are visible
+model = os.getenv('OPENAI_MODEL')  # None when OPENAI_MODEL is not set
+ex = create_extractor()  # module-level extractor used by query()
+ag = create_agent(llm_model=model)  # module-level agent used by query()
+def query(prompt):
+    clean, ver = ex.clean(prompt, verbose=True)
+    ans, ver = ag.ask(clean)
+    return ans
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(description="Process a user query.")
+    parser.add_argument('-q', '--query', type=str, required=True, help='A query string to process')
+    args = parser.parse_args()
+    ans = query(args.query)
+    print(ans["output"])

media/chainlit.png ADDED Viewed

requirements.txt CHANGED Viewed

@@ -10,5 +10,7 @@ rapidfuzz==3.6.1
 thefuzz==0.22.1
 faiss-cpu
 Levenshtein==0.25.0
-langsmith~=0.1.54
 python-dotenv==1.0.1

 thefuzz==0.22.1
 faiss-cpu
 Levenshtein==0.25.0
+langsmith~=0.0.92
 python-dotenv==1.0.1
+chainlit~=1.0.506
+pandas

src/database.py CHANGED Viewed

@@ -4,7 +4,7 @@ import pandas as pd
 import os
 import json
-engine = create_engine('sqlite:///../../data/games.db', echo=False)
 Base = declarative_base()
@@ -25,6 +25,7 @@ class Game(Base):
     season = Column(String)
     league_id = Column(Integer, ForeignKey('leagues.id'))
 class GameLineup(Base):
     __tablename__ = 'game_lineup'
     id = Column(Integer, primary_key=True)
@@ -46,6 +47,7 @@ class Team(Base):
     id = Column(Integer, primary_key=True)
     name = Column(String)
 class Player(Base):
     __tablename__ = 'players'
     hash = Column(String, primary_key=True)
@@ -75,11 +77,13 @@ class Commentary(Base):
     event_time_end = Column(Float)
     description = Column(Text)
 class League(Base):
     __tablename__ = 'leagues'
     id = Column(Integer, primary_key=True)
     name = Column(String)
 class Event(Base):
     __tablename__ = 'events'
     id = Column(Integer, primary_key=True)
@@ -92,36 +96,36 @@ class Event(Base):
     label = Column(String)
     visibility = Column(Boolean)
 class Augmented_Team(Base):
     __tablename__ = 'augmented_teams'
     id = Column(Integer, primary_key=True)
     team_id = Column(Integer, ForeignKey('teams.id'))
     augmented_name = Column(String)
 class Augmented_League(Base):
     __tablename__ = 'augmented_leagues'
     id = Column(Integer, primary_key=True)
     league_id = Column(Integer, ForeignKey('leagues.id'))
     augmented_name = Column(String)
 class Player_Event_Label(Base):
     __tablename__ = 'player_event_labels'
     id = Column(Integer, primary_key=True)
     label = Column(String)
 class Player_Event(Base):
     __tablename__ = 'player_events'
     id = Column(Integer, primary_key=True)
     game_id = Column(Integer, ForeignKey('games.id'))
     player_id = Column(Integer, ForeignKey('players.hash'))
-    time = Column(String) # Time in minutes of the game
     type = Column(Integer, ForeignKey('player_event_labels.id'))
-    linked_player = Column(Integer, ForeignKey('players.hash')) # If the event is linked to another player, for example a substitution
 # Create Tables
@@ -130,11 +134,13 @@ Base.metadata.create_all(engine)
 # Session setup
 Session = sessionmaker(bind=engine)
-def extract_time_from_player_event(time:str)->str:
     # Extract the time from the string
-    time = time.split("'")[0] # Need to keep it str because of overtime eg. (45+2)
     return time
 def get_or_create(session, model, **kwargs):
     instance = session.query(model).filter_by(**kwargs).first()
     if instance:
@@ -145,7 +151,8 @@ def get_or_create(session, model, **kwargs):
         session.commit()
         return instance
-def process_game_data(data,data2, league, season):
     session = Session()
     # Caption = d and v2 = d2
     home_team = data["gameHomeTeam"]
@@ -169,7 +176,8 @@ def process_game_data(data,data2, league, season):
     # Check if league exists
     league = get_or_create(session, League, name=league)
     if not game:
-        game = Game(timestamp=timestamp, score=score, goal_home=home_score, goal_away=away_score, round=round_, home_team_id=home_team.id, away_team_id=away_team.id,
                     venue=venue, date=date, attendance=attendance, season=season, league_id=league.id, referee=referee)
         session.add(game)
         session.commit()
@@ -187,22 +195,19 @@ def process_game_data(data,data2, league, season):
         for player_data in team_lineup["players"]:
             player_hash = player_data["hash"]
             name = player_data["long_name"]
-            if " " not in name: # Since some players are missing their first name, do this to help with the search
                 name = "NULL " + name
             number = player_data["shirt_number"]
             captain = player_data["captain"] == "(C)"
             starting = player_data["starting"]
             country = player_data["country"]
             position = player_data["lineup"]
-            facts = player_data.get("facts", None) # Facts might be empty
             player = get_or_create(session, Player, hash=player_hash, name=name, country=country)
             game_lineup = GameLineup(game_id=game.id, team_id=team_id, player_id=player.hash,
-                                     shirt_number=number, position=position, starting=starting, captain=captain, coach=False, tactics=tactic)
             if facts:
                 for fact in facts:
                     type = fact["type"]
@@ -210,7 +215,8 @@ def process_game_data(data,data2, league, season):
                     event = get_or_create(session, Player_Event_Label, id=int(type))
                     linked_player = fact.get("linked_player_hash", None)
-                    player_event = Player_Event(game_id=game.id, player_id=player.hash, time=time, type=event.id, linked_player=linked_player)
                     session.add(player_event)
             session.add(game_lineup)
@@ -223,7 +229,8 @@ def process_game_data(data,data2, league, season):
         coach_country = coach["country"]
         coach_player = get_or_create(session, Player, hash=coach_hash, name=coach_name, country=coach_country)
         game_lineup = GameLineup(game_id=game.id, team_id=team_id, player_id=coach_player.hash,
-                                 shirt_number=None, position=None, starting=None, captain=False, coach=True, tactics=tactic)
         session.add(game_lineup)
         # Commit all changes at once
@@ -241,7 +248,7 @@ def process_game_data(data,data2, league, season):
             label = "yellow card"
         elif label == "r-card":
             label = "red card"
         description = event["description"]
         important = event["important"] == "true"
         visible = event["visibility"]
@@ -257,9 +264,11 @@ def process_game_data(data,data2, league, season):
     return game.id, home_team.id, away_team.id
 def process_player_data(data):
     pass
 def process_ASR_data(data, game_id, period):
     session = Session()
     seg = data["segments"]
@@ -277,6 +286,7 @@ def process_ASR_data(data, game_id, period):
     session.commit()
     session.close()
 def convert_to_seconds(time_str):
     # Split the string into its components
     period, time = time_str.split(" - ")
@@ -321,17 +331,14 @@ def parse_labels_v2(data, session, home_team_id, away_team_id, game_id):
             game_time=game_time,  # Already in seconds
             frame_stamp=position,  # Make sure this is an integer or None
             team_id=team_id,  # Integer ID of the team
-            visibility=visibility, # Boolean
-            label=label # String with information
         )
         session.add(annotation_entry)
     session.commit()
 def process_json_files(directory):
     session = Session()
     fill_player_events(session)
@@ -355,7 +362,7 @@ def process_json_files(directory):
                 lb_cap = json.load(f)
             with open(os.path.join(root, "Labels-v2.json"), 'r') as f:
                 lb_v2 = json.load(f)
-            game_id, home_team_id, away_team_id = process_game_data(lb_cap,lb_v2, league, season)
         for file in asr_files:
             with open(os.path.join(root, file), 'r') as f:
@@ -368,19 +375,18 @@ def process_json_files(directory):
             elif '1_half-ASR' in file:
                 period = 1
                 # Parse and commit the data
-                process_ASR_data(data=asr, game_id = game_id, period=period)
             elif '2_half-ASR' in file:
                 period = 2
                 # Parse and commit the data
-                process_ASR_data(data=asr, game_id = game_id, period=period)
     session.commit()
     session.close()
-def fill_player_events(session):
     fact_id2label = {
         "1": "Yellow card",
         # Example: "time": "71' Ivanovic B. (Unsportsmanlike conduct)", "description": "Yellow Card"
@@ -397,9 +403,7 @@ def fill_player_events(session):
     session.commit()
 def fill_Augmented_Team(file_path):
     df = pd.read_csv(file_path)
     # the df should have two columns, team_name and augmented_name
@@ -417,6 +421,7 @@ def fill_Augmented_Team(file_path):
     session.commit()
     session.close()
 def fill_Augmented_League(file_path):
     # Read the csv file
     df = pd.read_csv(file_path)
@@ -432,14 +437,16 @@ def fill_Augmented_League(file_path):
         augmented_name = augmented_name.strip()
         league = session.query(League).filter_by(name=league_name).first()
         if league:
-            augmented_league = get_or_create(session, Augmented_League, league_id=league.id, augmented_name=augmented_name)
     session.commit()
     session.close()
 if __name__ == "__main__":
     # Example directory path
-    process_json_files('../data/Dataset/SoccerNet/')
-    fill_Augmented_Team('../data/dataset/augmented_teams.csv')
-    fill_Augmented_League('../data/dataset/augmented_leagues.csv')
 # Rename the event/annotation table to something more descriptive. Events are fucking everything else over

 import os
 import json
+engine = create_engine('sqlite:///../data/games.db', echo=False)
 Base = declarative_base()
     season = Column(String)
     league_id = Column(Integer, ForeignKey('leagues.id'))
 class GameLineup(Base):
     __tablename__ = 'game_lineup'
     id = Column(Integer, primary_key=True)
     id = Column(Integer, primary_key=True)
     name = Column(String)
 class Player(Base):
     __tablename__ = 'players'
     hash = Column(String, primary_key=True)
     event_time_end = Column(Float)
     description = Column(Text)
 class League(Base):
     __tablename__ = 'leagues'
     id = Column(Integer, primary_key=True)
     name = Column(String)
 class Event(Base):
     __tablename__ = 'events'
     id = Column(Integer, primary_key=True)
     label = Column(String)
     visibility = Column(Boolean)
 class Augmented_Team(Base):
     __tablename__ = 'augmented_teams'
     id = Column(Integer, primary_key=True)
     team_id = Column(Integer, ForeignKey('teams.id'))
     augmented_name = Column(String)
 class Augmented_League(Base):
     __tablename__ = 'augmented_leagues'
     id = Column(Integer, primary_key=True)
     league_id = Column(Integer, ForeignKey('leagues.id'))
     augmented_name = Column(String)
 class Player_Event_Label(Base):
     __tablename__ = 'player_event_labels'
     id = Column(Integer, primary_key=True)
     label = Column(String)
 class Player_Event(Base):
     __tablename__ = 'player_events'
     id = Column(Integer, primary_key=True)
     game_id = Column(Integer, ForeignKey('games.id'))
     player_id = Column(Integer, ForeignKey('players.hash'))
+    time = Column(String)  # Time in minutes of the game
     type = Column(Integer, ForeignKey('player_event_labels.id'))
+    linked_player = Column(Integer, ForeignKey(
+        'players.hash'))  # If the event is linked to another player, for example a substitution
 # Create Tables
 # Session setup
 Session = sessionmaker(bind=engine)
+def extract_time_from_player_event(time: str) -> str:
     # Extract the time from the string
+    time = time.split("'")[0]  # Need to keep it str because of overtime eg. (45+2)
     return time
 def get_or_create(session, model, **kwargs):
     instance = session.query(model).filter_by(**kwargs).first()
     if instance:
         session.commit()
         return instance
+def process_game_data(data, data2, league, season):
     session = Session()
     # Caption = d and v2 = d2
     home_team = data["gameHomeTeam"]
     # Check if league exists
     league = get_or_create(session, League, name=league)
     if not game:
+        game = Game(timestamp=timestamp, score=score, goal_home=home_score, goal_away=away_score, round=round_,
+                    home_team_id=home_team.id, away_team_id=away_team.id,
                     venue=venue, date=date, attendance=attendance, season=season, league_id=league.id, referee=referee)
         session.add(game)
         session.commit()
         for player_data in team_lineup["players"]:
             player_hash = player_data["hash"]
             name = player_data["long_name"]
+            if " " not in name:  # Since some players are missing their first name, do this to help with the search
                 name = "NULL " + name
             number = player_data["shirt_number"]
             captain = player_data["captain"] == "(C)"
             starting = player_data["starting"]
             country = player_data["country"]
             position = player_data["lineup"]
+            facts = player_data.get("facts", None)  # Facts might be empty
             player = get_or_create(session, Player, hash=player_hash, name=name, country=country)
             game_lineup = GameLineup(game_id=game.id, team_id=team_id, player_id=player.hash,
+                                     shirt_number=number, position=position, starting=starting, captain=captain,
+                                     coach=False, tactics=tactic)
             if facts:
                 for fact in facts:
                     type = fact["type"]
                     event = get_or_create(session, Player_Event_Label, id=int(type))
                     linked_player = fact.get("linked_player_hash", None)
+                    player_event = Player_Event(game_id=game.id, player_id=player.hash, time=time, type=event.id,
+                                                linked_player=linked_player)
                     session.add(player_event)
             session.add(game_lineup)
         coach_country = coach["country"]
         coach_player = get_or_create(session, Player, hash=coach_hash, name=coach_name, country=coach_country)
         game_lineup = GameLineup(game_id=game.id, team_id=team_id, player_id=coach_player.hash,
+                                 shirt_number=None, position=None, starting=None, captain=False, coach=True,
+                                 tactics=tactic)
         session.add(game_lineup)
         # Commit all changes at once
             label = "yellow card"
         elif label == "r-card":
             label = "red card"
         description = event["description"]
         important = event["important"] == "true"
         visible = event["visibility"]
     return game.id, home_team.id, away_team.id
 def process_player_data(data):
     pass
 def process_ASR_data(data, game_id, period):
     session = Session()
     seg = data["segments"]
     session.commit()
     session.close()
 def convert_to_seconds(time_str):
     # Split the string into its components
     period, time = time_str.split(" - ")
             game_time=game_time,  # Already in seconds
             frame_stamp=position,  # Make sure this is an integer or None
             team_id=team_id,  # Integer ID of the team
+            visibility=visibility,  # Boolean
+            label=label  # String with information
         )
         session.add(annotation_entry)
     session.commit()
 def process_json_files(directory):
     session = Session()
     fill_player_events(session)
                 lb_cap = json.load(f)
             with open(os.path.join(root, "Labels-v2.json"), 'r') as f:
                 lb_v2 = json.load(f)
+            game_id, home_team_id, away_team_id = process_game_data(lb_cap, lb_v2, league, season)
         for file in asr_files:
             with open(os.path.join(root, file), 'r') as f:
             elif '1_half-ASR' in file:
                 period = 1
                 # Parse and commit the data
+                process_ASR_data(data=asr, game_id=game_id, period=period)
             elif '2_half-ASR' in file:
                 period = 2
                 # Parse and commit the data
+                process_ASR_data(data=asr, game_id=game_id, period=period)
     session.commit()
     session.close()
+def fill_player_events(session):
     fact_id2label = {
         "1": "Yellow card",
         # Example: "time": "71' Ivanovic B. (Unsportsmanlike conduct)", "description": "Yellow Card"
     session.commit()
 def fill_Augmented_Team(file_path):
     df = pd.read_csv(file_path)
     # the df should have two columns, team_name and augmented_name
     session.commit()
     session.close()
 def fill_Augmented_League(file_path):
     # Read the csv file
     df = pd.read_csv(file_path)
         augmented_name = augmented_name.strip()
         league = session.query(League).filter_by(name=league_name).first()
         if league:
+            augmented_league = get_or_create(session, Augmented_League, league_id=league.id,
+                                             augmented_name=augmented_name)
     session.commit()
     session.close()
 if __name__ == "__main__":
     # Example directory path
+    process_json_files('../data/Dataset/SN-ASR_captions_and_actions/')
+    fill_Augmented_Team('../data/Dataset/augmented_teams.csv')
+    fill_Augmented_League('../data/Dataset/augmented_leagues.csv')
 # Rename the event/annotation table to something more descriptive. Events are fucking everything else over

src/extractor.py CHANGED Viewed

@@ -32,13 +32,16 @@ if os.getenv('LANGSMITH'):
     os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
     os.environ[
         'LANGCHAIN_API_KEY'] = os.getenv("LANGSMITH_API_KEY")
-    os.environ['LANGCHAIN_PROJECT'] = 'master-theses'
-db = SQLDatabase.from_uri("sqlite:///data/games.db")
 # from langchain_anthropic import ChatAnthropic
 class Extractor():
     # llm = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)
-    #gpt-3.5-turbo
     def __init__(self, model="gpt-3.5-turbo-0125", schema_config=None, custom_extractor_prompt=None):
         # model = "gpt-4-0125-preview"
         if custom_extractor_prompt:
@@ -133,7 +136,6 @@ class Retriever():
         # Get matches and their scores, limited by the specified 'limit'
         matches = process.extract(prompt, self.response, limit=limit)
         filtered_matches = [match for match in matches if match[1] >= threshold]
         # If no matches meet the threshold, return the list of all matches' strings
@@ -142,7 +144,6 @@ class Retriever():
             # Fix for wrong properties being returned
             return [match[0] for match in matches if match[1] >= low_threshold]
         # If there's only one match meeting the threshold, return it as a string
         if len(filtered_matches) == 1:
             return filtered_matches[0][0]  # Return the matched string directly
@@ -247,7 +248,7 @@ def recheck_property_value(properties, property_name, retrievers, input_func):
             print("No close matches found. Please try again or type 'quit' to stop.")
-def check_and_update_properties(properties_list, retrievers, method="fuzzy", input_func=input):
     """
     Checks and updates the properties in the properties list based on close matches found in the database.
     The function iterates through each property in each property dictionary within the list,
@@ -263,7 +264,7 @@ def check_and_update_properties(properties_list, retrievers, method="fuzzy", inp
     The function updates the properties_list in place based on user choices for updating property values
     with close matches found by the retrievers.
     """
     for index, properties in enumerate(properties_list):
         for property_name, retriever in retrievers.items():  # Iterate using items to get both key and value
             property_values = properties.get(property_name, [])
@@ -279,7 +280,11 @@ def check_and_update_properties(properties_list, retrievers, method="fuzzy", inp
                         updated_property_values.append(augmented_value)
                         continue
                 # Since property_value is now expected to be a list, we handle each value individually
-                top_matches = retriever.find_close_matches(value, method=method, n=3)
                 # Check if the closest match is the same as the current value
                 if top_matches and top_matches[0] == value:
@@ -296,30 +301,38 @@ def check_and_update_properties(properties_list, retrievers, method="fuzzy", inp
                     updated_property_values.append(top_matches)
                     properties[property_name] = updated_property_values
                     continue
-                print(f"\nCurrent {property_name}: {value}")
-                for i, match in enumerate(top_matches, start=1):
-                    print(f"[{i}] {match}")
-                print("[4] Enter new value")
-                # hmm = input_func(f"Fix for Pycharm, press enter to continue")
-                choice = input_func(f"Select the best match for {property_name} (1-4): ")
-                if choice in ['1', '2', '3']:
-                    selected_match = top_matches[int(choice) - 1]
-                    updated_property_values.append(selected_match)  # Update with the selected match
-                    print(f"Updated {property_name} to {selected_match}")
-                elif choice == '4':
-                    # Allow re-entry of value for this specific item
-                    recheck_property_value(properties, property_name, value, retrievers, input_func)
-                    # Note: Implement recheck_property_value to handle individual value updates within the list
-                else:
-                    print("Invalid selection. Property not updated.")
-                    updated_property_values.append(value)  # Keep the original value
             # Update the entire list for the property after processing all values
             properties[property_name] = updated_property_values
 # Function to remove duplicates
 def remove_duplicates(dicts):
@@ -354,18 +367,21 @@ def fetch_pks(properties_list, retrievers):
     return all_pk_attributes
-def update_prompt(prompt, properties, pk, properties_original):
-    # Replace the original prompt with the updated properties and pk
-    prompt = prompt.replace("{{properties}}", str(properties))
-    prompt = prompt.replace("{{pk}}", str(pk))
-    return prompt
-def update_prompt_enhanced(prompt, properties, pk, properties_original):
     updated_info = ""
     for prop, pk_info, prop_orig in zip(properties, pk, properties_original):
         for key in prop.keys():
             # Extract original and updated values
             orig_values = prop_orig.get(key, [])
             updated_values = prop.get(key, [])
@@ -391,9 +407,13 @@ def update_prompt_enhanced(prompt, properties, pk, properties_original):
                 if orig_value != updated_value and pk_value:
                     update_statement = f"\n- {orig_value} (now referred to as {updated_value}) has a primary key: {pk_value}."
                 elif orig_value != updated_value:
-                    update_statement = f"\n- {orig_value} (now referred to as {updated_value})."
                 elif pk_value:
                     update_statement = f"\n- {orig_value} has a primary key: {pk_value}."
                 updated_info += update_statement
@@ -417,7 +437,7 @@ def prompt_cleaner(prompt, db, schema_config):
         check_and_update_properties(properties, retrievers)
         pk = fetch_pks(properties, retrievers)
-    properties = update_prompt_enhanced(prompt, properties, pk, properties_original)
     return properties, pk
@@ -462,8 +482,9 @@ class PromptCleaner:
         self.schema_config = schema_config
         self.retrievers = setup_retrievers(self.db, self.schema_config)
         self.cust_extractor_prompt = custom_extractor_prompt
-    def clean(self, prompt, return_pk=False, test=False, verbose = False):
         """
         Processes the given prompt to extract properties, remove duplicates, update the properties
         based on close matches within the database, and fetch primary keys for these properties.
@@ -493,24 +514,50 @@ class PromptCleaner:
             properties = extract_properties(prompt, self.schema_config)
         # Keep original properties for later use
         properties_original = deepcopy(properties)
         if test:
             return properties_original
         # Remove duplicates - Happens when there are more than one player or team in the prompt
         # properties = remove_duplicates(properties)
         pk = None
         if properties:
             check_and_update_properties(properties, self.retrievers)
             pk = fetch_pks(properties, self.retrievers)
-        properties = update_prompt_enhanced(prompt, properties, pk, properties_original)
-        if return_pk:
             return properties, pk
         elif verbose:
             return properties, properties_original
         else:
-            return properties
 def load_json(file_path: str) -> dict:
@@ -518,24 +565,24 @@ def load_json(file_path: str) -> dict:
         return json.load(file)
-def create_extractor(schema: str = "src/conf/schema.json", db: SQLDatabase = "sqlite:///data/games.db", ):
     schema_config = load_json(schema)
     db = SQLDatabase.from_uri(db)
     pre_prompt = """Extract and save the relevant entities mentioned \
                     in the following passage together with their properties.
                     Only extract the properties mentioned in the 'information_extraction' function.
                     The questions are soccer related. game_event are things like yellow cards, goals, assists, freekick ect.
                     Generic properties like, "description", "home team", "away team", "game" ect should NOT be extracted.
                     If a property is not present and is not required in the function parameters, do not include it in the output.
                     If no properties are found, return an empty list.
                     Here are some exampels:
                     'How many goals did Henry score for Arsnl in the 2015 season?'
                     person_name': ['Henry'], 'team_name': [Arsnl],'year_season': ['2015'],
                     Passage:
                     {input}
     """
@@ -544,15 +591,19 @@ def create_extractor(schema: str = "src/conf/schema.json", db: SQLDatabase = "sq
 if __name__ == "__main__":
     schema_config = load_json("src/conf/schema.json")
     # Add game and league to the schema_config
     # prompter = PromptCleaner(db, schema_config, custom_extractor_prompt=extract_prompt)
     prompter = create_extractor("src/conf/schema.json", "sqlite:///data/games.db")
-    prompt= prompter.clean("Give me goals, shots on target, shots off target and corners from the game between ManU and Swansa")
     print(prompt)

     os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
     os.environ[
         'LANGCHAIN_API_KEY'] = os.getenv("LANGSMITH_API_KEY")
+    os.environ['LANGCHAIN_PROJECT'] = os.getenv('LANGSMITH_PROJECT')
+db_uri = os.getenv('DATABASE_PATH')
+db_uri = f"sqlite:///{db_uri}"
+db = SQLDatabase.from_uri(db_uri)
 # from langchain_anthropic import ChatAnthropic
 class Extractor():
     # llm = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)
+    # gpt-3.5-turbo
     def __init__(self, model="gpt-3.5-turbo-0125", schema_config=None, custom_extractor_prompt=None):
         # model = "gpt-4-0125-preview"
         if custom_extractor_prompt:
         # Get matches and their scores, limited by the specified 'limit'
         matches = process.extract(prompt, self.response, limit=limit)
         filtered_matches = [match for match in matches if match[1] >= threshold]
         # If no matches meet the threshold, return the list of all matches' strings
             # Fix for wrong properties being returned
             return [match[0] for match in matches if match[1] >= low_threshold]
         # If there's only one match meeting the threshold, return it as a string
         if len(filtered_matches) == 1:
             return filtered_matches[0][0]  # Return the matched string directly
             print("No close matches found. Please try again or type 'quit' to stop.")
+def check_and_update_properties(properties_list, retrievers, method="fuzzy", input_func="input"):
     """
     Checks and updates the properties in the properties list based on close matches found in the database.
     The function iterates through each property in each property dictionary within the list,
     The function updates the properties_list in place based on user choices for updating property values
     with close matches found by the retrievers.
     """
+    return_list = []
     for index, properties in enumerate(properties_list):
         for property_name, retriever in retrievers.items():  # Iterate using items to get both key and value
             property_values = properties.get(property_name, [])
                         updated_property_values.append(augmented_value)
                         continue
                 # Since property_value is now expected to be a list, we handle each value individually
+                if input_func == "chainlit":
+                    n = 5
+                else:
+                    n = 3
+                top_matches = retriever.find_close_matches(value, method=method, n=n)
                 # Check if the closest match is the same as the current value
                 if top_matches and top_matches[0] == value:
                     updated_property_values.append(top_matches)
                     properties[property_name] = updated_property_values
                     continue
+                if input_func == "input":
+                    print(f"\nCurrent {property_name}: {value}")
+                    for i, match in enumerate(top_matches, start=1):
+                        print(f"[{i}] {match}")
+                    print("[4] Enter new value")
+                    # hmm = input(f"Fix for Pycharm, press enter to continue")
+                    choice = input(f"Select the best match for {property_name} (1-4): ")
+                    if choice in ['1', '2', '3']:
+                        selected_match = top_matches[int(choice) - 1]
+                        updated_property_values.append(selected_match)  # Update with the selected match
+                        print(f"Updated {property_name} to {selected_match}")
+                    elif choice == '4':
+                        # Allow re-entry of value for this specific item
+                        recheck_property_value(properties, property_name, value, retrievers, input_func)
+                        # Note: Implement recheck_property_value to handle individual value updates within the list
+                    else:
+                        print("Invalid selection. Property not updated.")
+                        updated_property_values.append(value)  # Keep the original value
+                elif input_func == "chainlit":  # If we use UI, just return the list of top matches, and then let the user select
+                    options = {property_name: value, "top_matches": top_matches}
+                    return_list.append(options)
             # Update the entire list for the property after processing all values
             properties[property_name] = updated_property_values
+    if input_func == "chainlit":
+        return properties, return_list
+    else:
+        return properties
 # Function to remove duplicates
 def remove_duplicates(dicts):
     return all_pk_attributes
+# def update_prompt(prompt, properties, pk, properties_original):
+#     # Replace the original prompt with the updated properties and pk
+#     prompt = prompt.replace("{{properties}}", str(properties))
+#     prompt = prompt.replace("{{pk}}", str(pk))
+#     return prompt
+def update_prompt(prompt, properties, pk, properties_original, retrievers):
     updated_info = ""
     for prop, pk_info, prop_orig in zip(properties, pk, properties_original):
         for key in prop.keys():
             # Extract original and updated values
+            if key in retrievers:
+                # Fetch the primary key using the retriever for the current property
+                table = retrievers[key].table
             orig_values = prop_orig.get(key, [])
             updated_values = prop.get(key, [])
                 if orig_value != updated_value and pk_value:
                     update_statement = f"\n- {orig_value} (now referred to as {updated_value}) has a primary key: {pk_value}."
                 elif orig_value != updated_value:
+                    update_statement = f"\n- {orig_value} (now referred to as {updated_value}."
                 elif pk_value:
                     update_statement = f"\n- {orig_value} has a primary key: {pk_value}."
+                elif orig_value == updated_value and pk_value:
+                    update_statement = f"\n- {orig_value} has a primary key: {pk_value}."
+                elif orig_value == updated_value:
+                    update_statement = f"\n- {orig_value}."
                 updated_info += update_statement
         check_and_update_properties(properties, retrievers)
         pk = fetch_pks(properties, retrievers)
+    properties = update_prompt(prompt, properties, pk, properties_original)
     return properties, pk
         self.schema_config = schema_config
         self.retrievers = setup_retrievers(self.db, self.schema_config)
         self.cust_extractor_prompt = custom_extractor_prompt
+        self.properties_original = None
+    def clean(self, prompt, return_pk=False, test=False, verbose=False):
         """
         Processes the given prompt to extract properties, remove duplicates, update the properties
         based on close matches within the database, and fetch primary keys for these properties.
             properties = extract_properties(prompt, self.schema_config)
         # Keep original properties for later use
         properties_original = deepcopy(properties)
         if test:
             return properties_original
         # Remove duplicates - happens when there is more than one player or team in the prompt
         # properties = remove_duplicates(properties)
         pk = None
+        # VALIDATE PROPERTIES
         if properties:
             check_and_update_properties(properties, self.retrievers)
             pk = fetch_pks(properties, self.retrievers)
+        properties = update_prompt(prompt=prompt, properties=properties, pk=pk, properties_original=properties_original,
+                                   retrievers=self.retrievers)
+        # Prepare additional data if requested
+        if return_pk and verbose:
+            return (properties, pk), (properties, properties_original)
+        elif return_pk:
             return properties, pk
         elif verbose:
             return properties, properties_original
+        return properties
+    def extract_chainlit(self, prompt):
+        if self.cust_extractor_prompt:
+            properties = extract_properties(prompt, self.schema_config, self.cust_extractor_prompt)
         else:
+            properties = extract_properties(prompt, self.schema_config)
+        self.properties_original = deepcopy(properties)
+        return properties
+    def validate_chainlit(self, properties):
+        properties, need_val = check_and_update_properties(properties, self.retrievers, input_func="chainlit")
+        return properties, need_val
+    def build_prompt_chainlit(self, properties, prompt):
+        pk = None
+        # self.properties_original= deepcopy(properties)
+        if properties:
+            pk = fetch_pks(properties, self.retrievers)
+        prompt_new = update_prompt(prompt, properties, pk, self.properties_original, self.retrievers)
+        return prompt_new
 def load_json(file_path: str) -> dict:
         return json.load(file)
+def create_extractor(schema: str = "src/conf/schema.json", db: SQLDatabase = db_uri):
     schema_config = load_json(schema)
     db = SQLDatabase.from_uri(db)
     pre_prompt = """Extract and save the relevant entities mentioned \
                     in the following passage together with their properties.
                     Only extract the properties mentioned in the 'information_extraction' function.
                     The questions are soccer related. game_event are things like yellow cards, goals, assists, freekick ect.
                     Generic properties like, "description", "home team", "away team", "game" ect should NOT be extracted.
                     If a property is not present and is not required in the function parameters, do not include it in the output.
                     If no properties are found, return an empty list.
                     Here are some exampels:
                     'How many goals did Henry score for Arsnl in the 2015 season?'
                     person_name': ['Henry'], 'team_name': [Arsnl],'year_season': ['2015'],
                     Passage:
                     {input}
     """
 if __name__ == "__main__":
     schema_config = load_json("src/conf/schema.json")
     # Add game and league to the schema_config
     # prompter = PromptCleaner(db, schema_config, custom_extractor_prompt=extract_prompt)
     prompter = create_extractor("src/conf/schema.json", "sqlite:///data/games.db")
+    prompt = prompter.clean(
+        "Give me goals, shots on target, shots off target and corners from the game between ManU and Swansa and Manchester City")
     print(prompt)
+    # ex = create_extractor()
+    #
+    # val_list = [{'person_name': ['Cristiano Ronaldo'], 'team_name': ['Manchester City']}]
+    # user_prompt = "Did ronaldo play for city?"
+    # p = ex.build_prompt_chainlit(val_list, user_prompt)
+    # print(p)

src/sql_chain.py CHANGED Viewed

@@ -29,7 +29,7 @@ if os.getenv('LANGSMITH'):
     os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
     os.environ[
         'LANGCHAIN_API_KEY'] = os.getenv("LANGSMITH_API_KEY")
-    os.environ['LANGCHAIN_PROJECT'] = 'master-theses'
 def load_json(file_path: str) -> dict:
@@ -38,7 +38,8 @@ def load_json(file_path: str) -> dict:
 class SqlChain:
-    def __init__(self, few_shot_prompts: str, llm_model="gpt-3.5-turbo", db_uri="sqlite:///data/games.db", few_shot_k=2, verbose=True):
         self.llm = ChatOpenAI(model=llm_model, temperature=0)
         self.db = SQLDatabase.from_uri(db_uri)
         self.few_shot_k = few_shot_k
@@ -50,13 +51,12 @@ class SqlChain:
             db=self.db,
             prompt=self.full_prompt,
             max_iterations=10,
-            verbose=verbose,
             agent_type="openai-tools",
             # Default to 10 examples - Can be overwritten with the prompt
             top_k=30,
         )
     def _set_up_few_shot_prompts(self, few_shot_prompts: dict) -> None:
         few_shots = SemanticSimilarityExampleSelector.from_examples(
             few_shot_prompts,
@@ -68,6 +68,7 @@ class SqlChain:
         return few_shots
     def few_prompt_construct(self, query: str, top_k=5, dialect="SQLite") -> str:
         system_prefix = """You are an agent designed to interact with a SQL database.
         Given an input question, create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.
         ALWAYS query the database before returning an answer.
@@ -77,7 +78,7 @@ class SqlChain:
         You have access to tools for interacting with the database.
         Only use the given tools. Only use the information returned by the tools to construct your final answer.
         You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again.
         DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the database.
         If the question does not seem related to the database, just return 'I don't know' as the answer.
@@ -86,10 +87,17 @@ class SqlChain:
         Here are some examples of user inputs and their corresponding SQL queries. They are tested and works.
         Use them as a guide when creating your own queries:"""
         SUFFIX = """Begin!
             Question: {input}
-            Thought: I should look at the tables in the database to see what I can query.  Then I should query the schema of the most relevant tables.
             I will not stop until I query the database and return the answer.
             {agent_scratchpad}"""
@@ -117,6 +125,7 @@ class SqlChain:
                 "agent_scratchpad": [],
             }
         )
     def prompt_no_few_shot(self, query: str, dialect="SQLite") -> str:
         system_prefix = """You are an agent designed to interact with a SQL database.
         Given an input question, create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.
@@ -134,10 +143,22 @@ class SqlChain:
         return f"{system_prefix}\n{query}"
-    def ask(self, query: str, few_prompt:bool=True) -> str:
         if few_prompt:
             self.few_prompt_construct(query)
             return self.agent.invoke({"input": self.full_prompt}), self.full_prompt
@@ -146,15 +167,19 @@ class SqlChain:
             return self.agent.invoke(self.prompt_no_few_shot(query)), self.prompt_no_few_shot(query)
 def create_agent(few_shot_prompts: str = "src/conf/sqls.json", llm_model="gpt-3.5-turbo-0125",
-                 db_uri="sqlite:///data/games.db", few_shot_k=2, verbose=True):
     """ Create an agent with the given few_shot_prompts, llm_model and db_uri
      Call it with agent.ask(prompt)"""
-    return SqlChain(few_shot_prompts, llm_model, db_uri, few_shot_k, verbose)
 if __name__ == "__main__":
     chain = SqlChain("src/conf/sqls.json")
-    chain.ask("Is Manchester United in the database?", False)

     os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
     os.environ[
         'LANGCHAIN_API_KEY'] = os.getenv("LANGSMITH_API_KEY")
+    os.environ['LANGCHAIN_PROJECT'] = os.getenv('LANGSMITH_PROJECT')
 def load_json(file_path: str) -> dict:
 class SqlChain:
+    def __init__(self, few_shot_prompts: str, llm_model="gpt-3.5-turbo", db_uri="sqlite:///data/games.db",
+                 few_shot_k=2):
         self.llm = ChatOpenAI(model=llm_model, temperature=0)
         self.db = SQLDatabase.from_uri(db_uri)
         self.few_shot_k = few_shot_k
             db=self.db,
             prompt=self.full_prompt,
             max_iterations=10,
+            verbose=True,
             agent_type="openai-tools",
             # Default to 10 examples - Can be overwritten with the prompt
             top_k=30,
         )
     def _set_up_few_shot_prompts(self, few_shot_prompts: dict) -> None:
         few_shots = SemanticSimilarityExampleSelector.from_examples(
             few_shot_prompts,
         return few_shots
     def few_prompt_construct(self, query: str, top_k=5, dialect="SQLite") -> str:
         system_prefix = """You are an agent designed to interact with a SQL database.
         Given an input question, create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.
         ALWAYS query the database before returning an answer.
         You have access to tools for interacting with the database.
         Only use the given tools. Only use the information returned by the tools to construct your final answer.
         You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again.
         DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the database.
         If the question does not seem related to the database, just return 'I don't know' as the answer.
         Here are some examples of user inputs and their corresponding SQL queries. They are tested and works.
         Use them as a guide when creating your own queries:"""
+        # SUFFIX = """Begin!
+        #
+        #     Question: {input}
+        #     Thought: I should look at the tables in the database to see what I can query.  Then I should query the schema of the most relevant tables.
+        #     I will not stop until I query the database and return the answer.
+        #     {agent_scratchpad}"""
         SUFFIX = """Begin!
             Question: {input}
+            Thought: I should look at the examples provided and see if I can use them to identify tables and how to build the query.
+            Then I should query the schema of the most relevant tables.
             I will not stop until I query the database and return the answer.
             {agent_scratchpad}"""
                 "agent_scratchpad": [],
             }
         )
     def prompt_no_few_shot(self, query: str, dialect="SQLite") -> str:
         system_prefix = """You are an agent designed to interact with a SQL database.
         Given an input question, create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.
         return f"{system_prefix}\n{query}"
+    def ask(self, query: str, few_prompt: bool = True, rag_test=False) -> str:
+        if rag_test:
+            self.few_prompt_construct(query)
+            # Alter self.full_prompt to include only what's added by the RAG system
+            # Get content in self.full_prompt[messages][0][content]
+            prompt = self.full_prompt.messages
+            prompt = prompt[0].content
+            prompt = prompt.split("Use them as a guide when creating your own queries:\n\n")[1]
+            # Then remove everything after \n\nBegin!\n\n
+            prompt = prompt.split("\n\nBegin!\n\n")[0]
+            # Let's split it into a list, one element for each "User input: {input}\nSQL query: {query}"
+            prompt = prompt.split("User input: ")
+            # Then remove the first element
+            prompt = prompt[1:]
+            return prompt
         if few_prompt:
             self.few_prompt_construct(query)
             return self.agent.invoke({"input": self.full_prompt}), self.full_prompt
             return self.agent.invoke(self.prompt_no_few_shot(query)), self.prompt_no_few_shot(query)
 def create_agent(few_shot_prompts: str = "src/conf/sqls.json", llm_model="gpt-3.5-turbo-0125",
+                 db_uri="config", few_shot_k=2):
     """ Create an agent with the given few_shot_prompts, llm_model and db_uri
      Call it with agent.ask(prompt)"""
+    if db_uri == "config":
+        db_uri = os.getenv('DATABASE_PATH')
+        db_uri = f"sqlite:///{db_uri}"
+        # print(db_uri)
+        # print("sqlite:///data/games.db")
+        # exit(0)
+    return SqlChain(few_shot_prompts, llm_model, db_uri, few_shot_k)
 if __name__ == "__main__":
     chain = SqlChain("src/conf/sqls.json")
+    chain.ask("Is Manchester United in the database?", rag_test=True)