refactoring
- .cursor/rules/python.mdc +124 -0
- data/leaderboard.csv +18 -18
- src/app.py +94 -25
- src/judge.py +16 -9
- src/ui.py +23 -2
.cursor/rules/python.mdc
ADDED
@@ -0,0 +1,124 @@
+---
+description:
+globs: **/*.py, src/**/*.py, tests/**/*.py
+alwaysApply: false
+---
+---
+description: Python best practices and patterns for modern software development with Flask and SQLite
+globs: **/*.py, src/**/*.py, tests/**/*.py
+---
+
+# Python Best Practices
+
+## Project Structure
+- Use src-layout with `src/your_package_name/`
+- Place tests in `tests/` directory parallel to `src/`
+- Keep configuration in `config/` or as environment variables
+- Store requirements in `requirements.txt` or `pyproject.toml`
+- Place static files in `static/` directory
+- Use `templates/` for Jinja2 templates
+
+## Code Style
+- Follow Black code formatting
+- Use isort for import sorting
+- Follow PEP 8 naming conventions:
+  - snake_case for functions and variables
+  - PascalCase for classes
+  - UPPER_CASE for constants
+- Maximum line length of 88 characters (Black default)
+- Use absolute imports over relative imports
+
+## Type Hints
+- Use type hints for all function parameters and returns
+- Import types from `typing` module
+- Use `Optional[Type]` instead of `Type | None`
+- Use `TypeVar` for generic types
+- Define custom types in `types.py`
+- Use `Protocol` for duck typing
+
+## Flask Structure
+- Use Flask factory pattern
+- Organize routes using Blueprints
+- Use Flask-SQLAlchemy for database
+- Implement proper error handlers
+- Use Flask-Login for authentication
+- Structure views with proper separation of concerns
+
+## Database
+- Use SQLAlchemy ORM
+- Implement database migrations with Alembic
+- Use proper connection pooling
+- Define models in separate modules
+- Implement proper relationships
+- Use proper indexing strategies
+## Authentication
+- Use Flask-Login for session management
+- Implement Google OAuth using Flask-OAuth
+- Hash passwords with bcrypt
+- Use proper session security
+- Implement CSRF protection
+- Use proper role-based access control
+
+## API Design
+- Use Flask-RESTful for REST APIs
+- Implement proper request validation
+- Use proper HTTP status codes
+- Handle errors consistently
+- Use proper response formats
+- Implement proper rate limiting
+
+## Testing
+- Use pytest for testing
+- Write tests for all routes
+- Use pytest-cov for coverage
+- Implement proper fixtures
+- Use proper mocking with pytest-mock
+- Test all error scenarios
+
+## Security
+- Use HTTPS in production
+- Implement proper CORS
+- Sanitize all user inputs
+- Use proper session configuration
+- Implement proper logging
+- Follow OWASP guidelines
+
+## Performance
+- Use proper caching with Flask-Caching
+- Implement database query optimization
+- Use proper connection pooling
+- Implement proper pagination
+- Use background tasks for heavy operations
+- Monitor application performance
+
+## Error Handling
+- Create custom exception classes
+- Use proper try-except blocks
+- Implement proper logging
+- Return proper error responses
+- Handle edge cases properly
+- Use proper error messages
+
+## Documentation
+- Use Google-style docstrings
+- Document all public APIs
+- Keep README.md updated
+- Use proper inline comments
+- Generate API documentation
+- Document environment setup
+
+## Development Workflow
+- Use virtual environments (venv)
+- Implement pre-commit hooks
+- Use proper Git workflow
+- Follow semantic versioning
+- Use proper CI/CD practices
+- Implement proper logging
+
+## Dependencies
+- Pin dependency versions
+- Use requirements.txt for production
+- Separate dev dependencies
+- Use proper package versions
+- Regularly update dependencies
+- Check for security vulnerabilities
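Note: several of the rules above name a pattern without illustrating it. As a reference point, here is a minimal sketch of the "Flask factory pattern" with a Blueprint that the Flask Structure section calls for; all names here (`create_app`, `bp`, `config_object`) are illustrative, not code from this repository.

```python
# Minimal sketch of the Flask factory pattern + Blueprints named in the rules.
# All names are illustrative assumptions, not code from this repository.
from typing import Optional

from flask import Blueprint, Flask

bp = Blueprint("main", __name__)


@bp.route("/health")
def health() -> dict:
    """Trivial route so the blueprint has something to serve."""
    return {"status": "ok"}


def create_app(config_object: Optional[str] = None) -> Flask:
    """Application factory: build and configure a fresh app per call."""
    app = Flask(__name__)
    if config_object:
        app.config.from_object(config_object)
    app.register_blueprint(bp)
    return app


if __name__ == "__main__":
    create_app().run(debug=True)
```

Building the app inside a factory (rather than at import time) is what makes the testing rules above workable: each pytest fixture can call `create_app()` with its own configuration.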
data/leaderboard.csv
CHANGED
@@ -1,31 +1,31 @@
 judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license
+qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1649.8276600728695,21.0,2.0,23.0,Alibaba,Open Source
 claude-3-opus-latest,Claude 3 Opus,1531.9661669788793,2.0,0.0,2.0,Anthropic,Proprietary
 mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1516.736306793522,1.0,0.0,1.0,Mistral AI,Open Source
 qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1516.0,1.0,0.0,1.0,Alibaba,Open Source
 meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,1515.2298601853572,1.0,0.0,1.0,Meta,Open Source
 gpt-4-turbo,GPT-4 Turbo,1500.736306793522,1.0,1.0,2.0,OpenAI,Proprietary
-meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
-gemma-2-27b-it,Gemma 2 27B,1500.0,0.0,0.0,0.0,Google,Open Source
-gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
-qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
-mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1500.0,0.0,0.0,0.0,Mistral AI,Open Source
-gpt-3.5-turbo,GPT-3.5 Turbo,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
-atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
-claude-3-5-haiku-latest,Claude 3.5 Haiku,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
-claude-3-sonnet-20240229,Claude 3 Sonnet,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
 deepseek-r1,DeepSeek R1,1500.0,0.0,0.0,0.0,DeepSeek,Open Source
-judge1,EvalGPT,1500.0,0.0,0.0,0.0,OpenAI,Commercial
-judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
-judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
-judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
-judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
-meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
-meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
-o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
 deepseek-v3,DeepSeek V3,1500.0,0.0,0.0,0.0,DeepSeek,Open Source
+o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
+meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
+meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
+judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
+judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
+judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
+judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
+claude-3-sonnet-20240229,Claude 3 Sonnet,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
+claude-3-5-haiku-latest,Claude 3.5 Haiku,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
+atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
+mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1500.0,0.0,0.0,0.0,Mistral AI,Open Source
+qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
+gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
+gemma-2-27b-it,Gemma 2 27B,1500.0,0.0,0.0,0.0,Google,Open Source
+meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
+judge1,EvalGPT,1500.0,0.0,0.0,0.0,OpenAI,Commercial
 meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1499.263693206478,1.0,1.0,2.0,Meta,Open Source
 claude-3-haiku-20240307,Claude 3 Haiku,1499.263693206478,1.0,1.0,2.0,Anthropic,Proprietary
 gpt-4.1,GPT-4.1,1484.7701398146428,0.0,1.0,1.0,OpenAI,Proprietary
 claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1484.0,0.0,1.0,1.0,Anthropic,Proprietary
 gpt-4o,GPT-4o,1484.0,0.0,1.0,1.0,OpenAI,Proprietary
-
+gpt-3.5-turbo,GPT-3.5 Turbo,1318.2061729482512,0.0,21.0,21.0,OpenAI,Proprietary
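Note: the score movements in this file are consistent with a standard Elo update using K = 32. When two 1500-rated judges meet, the winner's expected score is 0.5, so it gains 32 x 0.5 = 16 points (1516.0) and the loser falls to 1484.0, matching the qwen-2.5-7b-instruct-turbo and gpt-4o rows. A minimal sketch of that update follows; K = 32 is inferred from the data, since the constant actually used by `update_leaderboard` in src/judge.py is not shown in this diff.

```python
# Minimal Elo-update sketch consistent with the scores above. K = 32 is an
# assumption inferred from the 1516.0 / 1484.0 pairs; the constant actually
# used by JudgeManager.update_leaderboard is not shown in this diff.
from typing import Tuple

K = 32.0


def expected_score(rating_a: float, rating_b: float) -> float:
    """Probability that A beats B under the Elo model."""
    return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400.0))


def elo_update(winner: float, loser: float, k: float = K) -> Tuple[float, float]:
    """Return the new (winner, loser) ratings after one comparison."""
    delta = k * (1.0 - expected_score(winner, loser))
    return winner + delta, loser - delta


# Two fresh 1500-rated judges: the winner lands on 1516.0, the loser on
# 1484.0, matching the qwen-2.5-7b-instruct-turbo and gpt-4o rows.
print(elo_update(1500.0, 1500.0))  # (1516.0, 1484.0)
```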
src/app.py
CHANGED
@@ -10,6 +10,7 @@ from src.ui import UI
 # Global state for evaluations
 eval1: Optional[Dict[str, Any]] = None
 eval2: Optional[Dict[str, Any]] = None
+selected_judges: list = []
 current_test_type: str = "grounding"


@@ -31,6 +32,18 @@ def initialize():
            test_type,
            judge_manager,
        ),
+        evaluate1_fn=lambda input_text, output_text, test_type: get_evaluation1(
+            input_text,
+            output_text,
+            test_type,
+            judge_manager,
+        ),
+        evaluate2_fn=lambda input_text, output_text, test_type: get_evaluation2(
+            input_text,
+            output_text,
+            test_type,
+            judge_manager,
+        ),
        winner1_fn=lambda: select_winner("Evaluation 1", judge_manager),
        winner2_fn=lambda: select_winner("Evaluation 2", judge_manager),
        refresh_leaderboard_fn=lambda: judge_manager.leaderboard_df,
@@ -57,50 +70,106 @@ def submit_example(
    output_text: str,
    test_type: str,
    judge_manager: JudgeManager,
-) -> Tuple[str, str, Any, Any]:
-    """
-    global eval1, eval2

    try:
-        logger.info(f"
+        logger.info(f"Preparing evaluation for test type: {test_type}")
        current_test_type = test_type
-        selected_judges = judge_manager.pick_random_judges()
-        eval1 = judge_manager.get_random_judges_evaluations(
-            input_text,
-            output_text,
-            test_type,
-            selected_judges[0],
-        )

-        eval2 = judge_manager.get_random_judges_evaluations(
-            input_text,
-            output_text,
-            test_type,
-            selected_judges[1],
-        )
+        # Reset evaluations
+        eval1 = None
+        eval2 = None

-
+        # Select random judges
+        selected_judges = judge_manager.pick_random_judges()
+
+        if len(selected_judges) < 2:
            return (
                "Error: Not enough judges available",
                "Error: Not enough judges available",
                None,
                None,
+                None,
+                gr.update(visible=False),
            )

+        # Show loading messages while evaluations are in progress
+        status_text = "Evaluations starting... Both judges will evaluate in parallel."
        return (
-
-
-            gr.update(
-            gr.update(
+            "Loading evaluation 1...",
+            "Loading evaluation 2...",
+            gr.update(value=input_text),
+            gr.update(value=output_text),
+            gr.update(value=test_type),
+            gr.update(visible=True, value=status_text),
        )
    except Exception as e:
-        logger.error(f"Error
+        logger.error(f"Error preparing evaluation: {e}")
        return (
            f"Error: {str(e)}",
            f"Error: {str(e)}",
-
-
+            gr.update(value=input_text),
+            gr.update(value=output_text),
+            gr.update(value=test_type),
+            gr.update(visible=False),
        )
+
+
+def get_evaluation1(
+    input_text: str,
+    output_text: str,
+    test_type: str,
+    judge_manager: JudgeManager,
+) -> Tuple[str, Any]:
+    """Get evaluation from the first judge."""
+    global eval1, selected_judges
+
+    try:
+        if not selected_judges or len(selected_judges) < 1:
+            return "No judges selected", gr.update(visible=False)
+
+        logger.info(f"Starting evaluation 1 with judge {selected_judges[0]['name']}")
+        # Get evaluation from the first judge
+        eval1 = judge_manager.get_evaluation(
+            selected_judges[0],
+            input_text,
+            output_text,
+            test_type,
+        )
+        logger.info("Completed evaluation 1")
+
+        # Make the selection button visible once the evaluation is ready
+        return eval1["display_evaluation"], gr.update(visible=True)
+    except Exception as e:
+        logger.error(f"Error getting evaluation 1: {e}")
+        return f"Error: {str(e)}", gr.update(visible=False)
+
+
+def get_evaluation2(
+    input_text: str,
+    output_text: str,
+    test_type: str,
+    judge_manager: JudgeManager,
+) -> Tuple[str, Any]:
+    """Get evaluation from the second judge."""
+    global eval2, selected_judges
+
+    try:
+        if not selected_judges or len(selected_judges) < 2:
+            return "No judges selected", gr.update(visible=False)
+
+        logger.info(f"Starting evaluation 2 with judge {selected_judges[1]['name']}")
+        # Get evaluation from the second judge
+        eval2 = judge_manager.get_evaluation(selected_judges[1], input_text, output_text, test_type)
+        logger.info("Completed evaluation 2")
+
+        # Make the selection button visible once the evaluation is ready
+        return eval2["display_evaluation"], gr.update(visible=True)
+    except Exception as e:
+        logger.error(f"Error getting evaluation 2: {e}")
+        return f"Error: {str(e)}", gr.update(visible=False)


 def select_winner(choice: str, judge_manager: JudgeManager) -> str:
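Note: `get_evaluation1` and `get_evaluation2` are near-duplicates, differing only in which entry of `selected_judges` they read and which global they write. A possible follow-up refactor, sketched below with a hypothetical `get_evaluation_for` helper that is not part of this commit, would parameterize the judge index. The sketch assumes the `gr`, `logger`, `selected_judges`, and `JudgeManager` names from src/app.py.

```python
# Hypothetical consolidation of get_evaluation1/get_evaluation2, NOT part of
# this commit. Assumes the module globals and imports of src/app.py
# (gr, logger, selected_judges, JudgeManager).
from typing import Any, Dict, List, Optional, Tuple

evals: List[Optional[Dict[str, Any]]] = [None, None]  # replaces eval1/eval2


def get_evaluation_for(
    index: int,
    input_text: str,
    output_text: str,
    test_type: str,
    judge_manager: "JudgeManager",
) -> Tuple[str, Any]:
    """Run one judge's evaluation and reveal its selection button."""
    try:
        if len(selected_judges) <= index:
            return "No judges selected", gr.update(visible=False)
        logger.info(f"Starting evaluation {index + 1} with judge {selected_judges[index]['name']}")
        evals[index] = judge_manager.get_evaluation(
            selected_judges[index], input_text, output_text, test_type
        )
        return evals[index]["display_evaluation"], gr.update(visible=True)
    except Exception as e:
        logger.error(f"Error getting evaluation {index + 1}: {e}")
        return f"Error: {str(e)}", gr.update(visible=False)
```

The two Gradio callbacks would then become thin wrappers, e.g. `lambda i, o, t: get_evaluation_for(0, i, o, t, judge_manager)` and the same with index 1, so the guard and logging logic exists once.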
src/judge.py
CHANGED
@@ -101,8 +101,10 @@ class JudgeManager:
                temperature=0.2,
                max_tokens=500,
            )
-            # Default fallback
            evaluation = api_response.choices[0].message.content
+        else:
+            # Default fallback
+            evaluation = f"No evaluation provider for {judge['provider']}"

        # Format the evaluation
        eval_prefix = f"Evaluation by {judge['name']} (ID: {judge['id']}):\n\n"
@@ -137,8 +139,11 @@ AI RESPONSE:

 Please evaluate this response carefully and provide your assessment."""

-    def pick_random_judges(self) ->
+    def pick_random_judges(self) -> List[Dict[str, Any]]:
        """Pick two random judges"""
+        if len(self.judges) < 2:
+            logger.error("Not enough judges available for comparison")
+            return []
        return random.sample(self.judges, 2)

    def get_random_judges_evaluations(
@@ -146,7 +151,7 @@ Please evaluate this response carefully and provide your assessment."""
        input_text: str,
        output_text: str,
        test_type: str,
-
+        selected_judge: Dict[str, Any],
    ) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
        """Get evaluations from two random judges"""
        if len(self.judges) < 2:
@@ -154,12 +159,14 @@ Please evaluate this response carefully and provide your assessment."""
            return None, None

        # Get evaluations from the judges
-
-
-
-
-
-
+        evaluation = self.get_evaluation(
+            selected_judge,
+            input_text,
+            output_text,
+            test_type,
+        )
+
+        return evaluation

    def update_leaderboard(self, winner_id: str, loser_id: str) -> pd.DataFrame:
        """Update the leaderboard after a comparison"""
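Note: the new `else` branch implies that `get_evaluation` dispatches on `judge['provider']`, with at least one branch calling a chat-completions-style client (`api_response.choices[0].message.content`). A minimal sketch of that dispatch shape follows; the client object and the "openai" provider name are assumptions, since only the `choices[0].message.content` access, the `temperature`/`max_tokens` arguments, and the fallback string appear in the diff itself.

```python
# Sketch of the provider dispatch implied by the new fallback branch in
# JudgeManager.get_evaluation. The client object and the "openai" provider
# value are assumptions; only the response access, temperature=0.2,
# max_tokens=500, and the fallback string appear in the diff.
from typing import Any, Dict


def evaluate_with_provider(client: Any, judge: Dict[str, Any], prompt: str) -> str:
    if judge["provider"] == "openai":  # assumed provider key/value
        api_response = client.chat.completions.create(
            model=judge["id"],
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            max_tokens=500,
        )
        evaluation = api_response.choices[0].message.content
    else:
        # Default fallback: surface the missing integration in the UI
        # instead of raising deep inside the judge call.
        evaluation = f"No evaluation provider for {judge['provider']}"
    return evaluation
```

Returning a readable fallback string instead of raising keeps the arena usable when a judge's provider integration is missing, at the cost of that string flowing into the comparison like a real evaluation.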
src/ui.py
CHANGED
@@ -13,6 +13,8 @@ class UI:
        self,
        refresh_fn: Callable,
        submit_fn: Callable,
+        evaluate1_fn: Callable,
+        evaluate2_fn: Callable,
        winner1_fn: Callable,
        winner2_fn: Callable,
        refresh_leaderboard_fn: Callable,
@@ -20,6 +22,8 @@ class UI:
    ):
        self.refresh_fn = refresh_fn
        self.submit_fn = submit_fn
+        self.evaluate1_fn = evaluate1_fn
+        self.evaluate2_fn = evaluate2_fn
        self.winner1_fn = winner1_fn
        self.winner2_fn = winner2_fn
        self.refresh_leaderboard_fn = refresh_leaderboard_fn
@@ -97,6 +101,7 @@ class UI:
                input_text = gr.Textbox(label="Input", lines=4)
                output_text = gr.Textbox(label="Output", lines=6)
                submit_button = gr.Button("Get Judge Evaluations")
+                status_message = gr.Markdown(visible=False)

        with gr.Row():
            with gr.Column():
@@ -129,10 +134,26 @@ class UI:
            [input_text, output_text],
        )

-
+        # Modified submit to prepare for evaluation and trigger both evaluations in parallel
+        submit_event = submit_button.click(
            self.submit_fn,
            [input_text, output_text, test_type_dropdown],
-            [evaluation1, evaluation2,
+            [evaluation1, evaluation2, input_text, output_text, test_type_dropdown, status_message],
+        )
+
+        # Start both evaluations simultaneously (in parallel) after submit completes
+        submit_event.then(
+            self.evaluate1_fn,
+            [input_text, output_text, test_type_dropdown],
+            [evaluation1, select_eval1],
+            queue=False,  # Run immediately without waiting in queue
+        )
+
+        submit_event.then(
+            self.evaluate2_fn,
+            [input_text, output_text, test_type_dropdown],
+            [evaluation2, select_eval2],
+            queue=False,  # Run immediately without waiting in queue
        )

        select_eval1.click(
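Note: the wiring above relies on Gradio event chaining: `Button.click(...)` returns an event object, and each `.then(...)` attached to it fires after the click handler finishes. A minimal, self-contained sketch of the same pattern, with generic names rather than this repository's components:

```python
# Minimal sketch of the click -> then chaining pattern used above.
# Component and function names are generic placeholders, not this repo's code.
import gradio as gr


def prepare(text: str) -> str:
    return f"Working on: {text}"


def finish(text: str) -> str:
    return f"Done: {text.upper()}"


with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    status = gr.Markdown()
    result = gr.Markdown()
    btn = gr.Button("Run")

    # The click handler runs first and updates the status...
    evt = btn.click(prepare, inputs=inp, outputs=status)
    # ...then each .then() handler fires once the click handler completes.
    evt.then(finish, inputs=inp, outputs=result)

if __name__ == "__main__":
    demo.launch()
```

Whether the two `.then()` handlers in the diff truly overlap depends on Gradio's queue and concurrency settings; `queue=False` bypasses the request queue but does not by itself guarantee that both judge calls run concurrently.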