Alex-q-z committed on
Commit
4b73981
·
1 Parent(s): 200e56a

[Refactor] Clean up the frontend for correct visualization and better readability

Browse files
app.py CHANGED
@@ -37,14 +37,14 @@ def filter_and_display(selected_columns, model_types, datasets, stage):
37
  # Adjust aggregation based on stage
38
  if stage == "decode":
39
  filtered = filtered.groupby(["Method", "Model"], as_index=False).agg({
40
- "Throughput (token/s)": "mean",
41
  "Quality": "mean",
 
42
  "Link": "first"
43
  })
44
  else:
45
  filtered = filtered.groupby(["Method", "Model"], as_index=False).agg({
46
  "Quality": "mean",
47
- "TTFT": "mean",
48
  "Link": "first"
49
  })
50
 
@@ -55,15 +55,33 @@ def filter_and_display(selected_columns, model_types, datasets, stage):
55
  def create_prefill_visualization(filtered_data):
56
  if filtered_data.empty:
57
  return None
58
- fig = px.bar(filtered_data, x='Model', y='Quality', color='Method', barmode='group',
59
- title='Prefill Stage: Quality by Model and Method')
 
 
 
 
 
 
 
 
 
60
  return fig
61
 
62
  def create_decode_visualization(filtered_data):
63
  if filtered_data.empty:
64
  return None
65
- fig = px.bar(filtered_data, x='Model', y='Throughput (token/s)', color='Method', barmode='group',
66
- title='Decode Stage: Throughput by Model and Method')
 
 
 
 
 
 
 
 
 
67
  return fig
68
 
69
  # Load the data from the /data folder
@@ -75,27 +93,26 @@ def create_gradio_app():
75
  with gr.Blocks() as app:
76
  with gr.Row():
77
  gr.Markdown(
78
- """# KV Cache Benchmark
79
- ### Demo leaderboard
80
- This demo leaderboard allows users to explore and compare different KV cache implementations across various models and datasets. It provides interactive filtering options and real-time updates of benchmark results, including visualization of Quality and TTFT metrics.
81
- """)
82
 
83
  with gr.Tabs():
84
  with gr.TabItem("KV Cache Benchmark"):
85
  # Prefill-stage selection
86
  with gr.Row():
87
- gr.Markdown("## Prefill-stage Selection")
88
  with gr.Row():
89
  with gr.Column():
90
- gr.Markdown("#### Select Columns to Display")
91
  prefill_columns_to_display = gr.CheckboxGroup(
92
- choices=["Quality", "TTFT", "Link"],
93
- label="Columns",
94
- value=["Quality", "TTFT"]
95
  )
96
 
97
  with gr.Column():
98
- gr.Markdown("#### Model Types")
99
  prefill_model_types = gr.CheckboxGroup(
100
  choices=list(data["Model"].unique()),
101
  label="Model Types",
@@ -103,7 +120,7 @@ This demo leaderboard allows users to explore and compare different KV cache imp
103
  )
104
 
105
  with gr.Column():
106
- gr.Markdown("#### Datasets")
107
  prefill_datasets = gr.CheckboxGroup(
108
  choices=list(data[data["Stage"] == "prefill"]["Dataset"].unique()),
109
  label="Datasets",
@@ -112,21 +129,21 @@ This demo leaderboard allows users to explore and compare different KV cache imp
112
 
113
  # Prefill-stage compression results
114
  with gr.Row():
115
- gr.Markdown("## Prefill-stage Compression Results")
116
 
117
- prefill_results = gr.Dataframe(value=filter_and_display(["Quality", "TTFT"], list(data["Model"].unique()), list(data["Dataset"].unique()), "prefill"), headers=["Method", "Model", "Quality", "TTFT", "Link"])
118
 
119
  # Prefill-stage visualization
120
  with gr.Row():
121
- gr.Markdown("### Prefill-stage Visualization")
122
- prefill_plot = gr.Plot(value=create_prefill_visualization(filter_and_display(["Quality"], list(data["Model"].unique()), list(data["Dataset"].unique()), "prefill")))
123
 
124
  # Decode-stage selection
125
  with gr.Row():
126
- gr.Markdown("## Decode-stage Selection")
127
  with gr.Row():
128
  with gr.Column():
129
- gr.Markdown("#### Select Columns to Display")
130
  decode_columns_to_display = gr.CheckboxGroup(
131
  choices=["Throughput (token/s)", "Quality", "Link"],
132
  label="Columns",
@@ -134,7 +151,7 @@ This demo leaderboard allows users to explore and compare different KV cache imp
134
  )
135
 
136
  with gr.Column():
137
- gr.Markdown("#### Model Types")
138
  decode_model_types = gr.CheckboxGroup(
139
  choices=list(data["Model"].unique()),
140
  label="Model Types",
@@ -142,7 +159,7 @@ This demo leaderboard allows users to explore and compare different KV cache imp
142
  )
143
 
144
  with gr.Column():
145
- gr.Markdown("#### Datasets")
146
  decode_datasets = gr.CheckboxGroup(
147
  choices=list(data[data["Stage"] == "decode"]["Dataset"].unique()),
148
  label="Datasets",
@@ -151,14 +168,14 @@ This demo leaderboard allows users to explore and compare different KV cache imp
151
 
152
  # Decode-stage compression results
153
  with gr.Row():
154
- gr.Markdown("## Decode-stage Compression Results")
155
 
156
  decode_results = gr.Dataframe(value=filter_and_display(["Throughput (token/s)", "Quality"], list(data["Model"].unique()), list(data["Dataset"].unique()), "decode"), headers=["Method", "Model", "Throughput (token/s)", "Quality", "Link"])
157
 
158
  # Decode-stage visualization
159
  with gr.Row():
160
- gr.Markdown("### Decode-stage Visualization")
161
- decode_plot = gr.Plot(value=create_decode_visualization(filter_and_display(["Throughput (token/s)"], list(data["Model"].unique()), list(data["Dataset"].unique()), "decode")))
162
 
163
  def auto_update_prefill(selected_columns, model_types, datasets):
164
  if not model_types or not datasets:
 
37
  # Adjust aggregation based on stage
38
  if stage == "decode":
39
  filtered = filtered.groupby(["Method", "Model"], as_index=False).agg({
 
40
  "Quality": "mean",
41
+ "Throughput (token/s)": "mean",
42
  "Link": "first"
43
  })
44
  else:
45
  filtered = filtered.groupby(["Method", "Model"], as_index=False).agg({
46
  "Quality": "mean",
47
+ "TTFT (s)": "mean",
48
  "Link": "first"
49
  })
50
 
 
55
  def create_prefill_visualization(filtered_data):
56
  if filtered_data.empty:
57
  return None
58
+ fig = px.line(filtered_data,
59
+ x='TTFT (s)',
60
+ y='Quality',
61
+ color='Method',
62
+ title='Quality-TTFT trade-off of different methods',
63
+ markers=True
64
+ )
65
+ fig.update_layout(
66
+ xaxis=dict(range=[0.0, 5.0]),
67
+ yaxis=dict(range=[0.0, 40.0])
68
+ )
69
  return fig
70
 
71
  def create_decode_visualization(filtered_data):
72
  if filtered_data.empty:
73
  return None
74
+ fig = px.line(filtered_data,
75
+ x='Throughput (token/s)',
76
+ y='Quality',
77
+ color='Method',
78
+ title='Quality-throughput trade-off of different methods',
79
+ markers=True
80
+ )
81
+ fig.update_layout(
82
+ xaxis=dict(range=[0.0, 1000.0]),
83
+ yaxis=dict(range=[0.0, 1.0])
84
+ )
85
  return fig
86
 
87
  # Load the data from the /data folder
 
93
  with gr.Blocks() as app:
94
  with gr.Row():
95
  gr.Markdown(
96
+ """# KV Cache Arena
97
+ We invite users and developers to explore and compare various KV cache and prompt compression methods across different language models and workloads. Our platform offers interactive filtering options and real-time visualizations, enabling seamless analysis of benchmarking results.
98
+ """)
 
99
 
100
  with gr.Tabs():
101
  with gr.TabItem("KV Cache Benchmark"):
102
  # Prefill-stage selection
103
  with gr.Row():
104
+ gr.Markdown("## Prefill-Stage KV Cache Compression")
105
  with gr.Row():
106
  with gr.Column():
107
+ # gr.Markdown("#### Select Columns to Display")
108
  prefill_columns_to_display = gr.CheckboxGroup(
109
+ choices=["Quality", "TTFT (s)", "Link"],
110
+ label="Metrics",
111
+ value=["Quality", "TTFT (s)"]
112
  )
113
 
114
  with gr.Column():
115
+ # gr.Markdown("#### Model Types")
116
  prefill_model_types = gr.CheckboxGroup(
117
  choices=list(data["Model"].unique()),
118
  label="Model Types",
 
120
  )
121
 
122
  with gr.Column():
123
+ # gr.Markdown("#### Datasets")
124
  prefill_datasets = gr.CheckboxGroup(
125
  choices=list(data[data["Stage"] == "prefill"]["Dataset"].unique()),
126
  label="Datasets",
 
129
 
130
  # Prefill-stage compression results
131
  with gr.Row():
132
+ gr.Markdown("## Results")
133
 
134
+ prefill_results = gr.Dataframe(value=filter_and_display(["Quality", "TTFT (s)"], list(data["Model"].unique()), list(data["Dataset"].unique()), "prefill"), headers=["Method", "Model", "Quality", "TTFT (s)", "Link"])
135
 
136
  # Prefill-stage visualization
137
  with gr.Row():
138
+ # gr.Markdown("### Visualization")
139
+ prefill_plot = gr.Plot(value=create_prefill_visualization(filter_and_display(["Quality", "TTFT (s)"], list(data["Model"].unique()), list(data["Dataset"].unique()), "prefill")))
140
 
141
  # Decode-stage selection
142
  with gr.Row():
143
+ gr.Markdown("## Decode-Stage KV Cache Compression")
144
  with gr.Row():
145
  with gr.Column():
146
+ # gr.Markdown("#### Select Columns to Display")
147
  decode_columns_to_display = gr.CheckboxGroup(
148
  choices=["Throughput (token/s)", "Quality", "Link"],
149
  label="Columns",
 
151
  )
152
 
153
  with gr.Column():
154
+ # gr.Markdown("#### Model Types")
155
  decode_model_types = gr.CheckboxGroup(
156
  choices=list(data["Model"].unique()),
157
  label="Model Types",
 
159
  )
160
 
161
  with gr.Column():
162
+ # gr.Markdown("#### Datasets")
163
  decode_datasets = gr.CheckboxGroup(
164
  choices=list(data[data["Stage"] == "decode"]["Dataset"].unique()),
165
  label="Datasets",
 
168
 
169
  # Decode-stage compression results
170
  with gr.Row():
171
+ gr.Markdown("## Results")
172
 
173
  decode_results = gr.Dataframe(value=filter_and_display(["Throughput (token/s)", "Quality"], list(data["Model"].unique()), list(data["Dataset"].unique()), "decode"), headers=["Method", "Model", "Throughput (token/s)", "Quality", "Link"])
174
 
175
  # Decode-stage visualization
176
  with gr.Row():
177
+ # gr.Markdown("### Visualization")
178
+ decode_plot = gr.Plot(value=create_decode_visualization(filter_and_display(["Quality", "Throughput (token/s)"], list(data["Model"].unique()), list(data["Dataset"].unique()), "decode")))
179
 
180
  def auto_update_prefill(selected_columns, model_types, datasets):
181
  if not model_types or not datasets:
data/{decode_h2o_Llama3.1-8B-Instruct_LongGenBench.json → decode_H2O_Mistral-7B-v0.3_LongGenBench.json} RENAMED
File without changes
data/{decode_streamingLLM_Llama3.1-8B-Instruct_LongGenBench.json → decode_StreamingLLM_Mistral-7B-v0.3_LongGenBench.json} RENAMED
File without changes
data/{decode_vllm_Llama3.1-8B-Instruct_LongGenBench.json → decode_vLLM_Mistral-7B-v0.3_LongGenBench.json} RENAMED
File without changes
data/{prefill_cachegen_Llama3.1-8B-Instruct_NarrativeQA.json → prefill_CacheGen_Mistral-7B-v0.3_NarrativeQA.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
  "Quality": 29.53,
3
- "TTFT":2.5,
4
  "Link": "www.google.com"
5
  }
 
1
  {
2
  "Quality": 29.53,
3
+ "TTFT (s)": 2.5,
4
  "Link": "www.google.com"
5
  }
data/{prefill_kivi_Llama3.1-8B-Instruct_NarrativeQA.json → prefill_KIVI_Mistral-7B-v0.3_NarrativeQA.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
  "Quality": 27.27,
3
- "TTFT":3.3,
4
  "Link": "www.google.com"
5
  }
 
1
  {
2
  "Quality": 27.27,
3
+ "TTFT (s)": 3.3,
4
  "Link": "www.google.com"
5
  }
data/{prefill_vllm_Llama3.1-8B-Instruct_NarrativeQA.json → prefill_vLLM_Mistral-7B-v0.3_NarrativeQA.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
  "Quality": 29.26,
3
- "TTFT":4.8,
4
  "Link": "www.google.com"
5
  }
 
1
  {
2
  "Quality": 29.26,
3
+ "TTFT (s)": 4.8,
4
  "Link": "www.google.com"
5
  }