lmcache-benchmark-lite

Sleeping

App Files Files Community

Alex-q-z committed on Feb 7

Commit

4b73981

1 Parent(s): 200e56a

[Refactor] Clean up the frontend for correct visualization and better readability

Browse files

Files changed (7) hide show

app.py +45 -28
data/{decode_h2o_Llama3.1-8B-Instruct_LongGenBench.json → decode_H2O_Mistral-7B-v0.3_LongGenBench.json} +0 -0
data/{decode_streamingLLM_Llama3.1-8B-Instruct_LongGenBench.json → decode_StreamingLLM_Mistral-7B-v0.3_LongGenBench.json} +0 -0
data/{decode_vllm_Llama3.1-8B-Instruct_LongGenBench.json → decode_vLLM_Mistral-7B-v0.3_LongGenBench.json} +0 -0
data/{prefill_cachegen_Llama3.1-8B-Instruct_NarrativeQA.json → prefill_CacheGen_Mistral-7B-v0.3_NarrativeQA.json} +1 -1
data/{prefill_kivi_Llama3.1-8B-Instruct_NarrativeQA.json → prefill_KIVI_Mistral-7B-v0.3_NarrativeQA.json} +1 -1
data/{prefill_vllm_Llama3.1-8B-Instruct_NarrativeQA.json → prefill_vLLM_Mistral-7B-v0.3_NarrativeQA.json} +1 -1

app.py CHANGED Viewed

@@ -37,14 +37,14 @@ def filter_and_display(selected_columns, model_types, datasets, stage):
         # Adjust aggregation based on stage
         if stage == "decode":
             filtered = filtered.groupby(["Method", "Model"], as_index=False).agg({
-                "Throughput (token/s)": "mean",
                 "Quality": "mean",
                 "Link": "first"
             })
         else:
             filtered = filtered.groupby(["Method", "Model"], as_index=False).agg({
                 "Quality": "mean",
-                "TTFT": "mean",
                 "Link": "first"
             })
@@ -55,15 +55,33 @@ def filter_and_display(selected_columns, model_types, datasets, stage):
 def create_prefill_visualization(filtered_data):
     if filtered_data.empty:
         return None
-    fig = px.bar(filtered_data, x='Model', y='Quality', color='Method', barmode='group',
-                 title='Prefill Stage: Quality by Model and Method')
     return fig
 def create_decode_visualization(filtered_data):
     if filtered_data.empty:
         return None
-    fig = px.bar(filtered_data, x='Model', y='Throughput (token/s)', color='Method', barmode='group',
-                 title='Decode Stage: Throughput by Model and Method')
     return fig
 # Load the data from the /data folder
@@ -75,27 +93,26 @@ def create_gradio_app():
     with gr.Blocks() as app:
         with gr.Row():
             gr.Markdown(
-                """# KV Cache Benchmark
-### Demo leaderboard
-This demo leaderboard allows users to explore and compare different KV cache implementations across various models and datasets. It provides interactive filtering options and real-time updates of benchmark results, including visualization of Quality and TTFT metrics.
-""")
         with gr.Tabs():
             with gr.TabItem("KV Cache Benchmark"):
                 # Prefill-stage selection
                 with gr.Row():
-                    gr.Markdown("## Prefill-stage Selection")
                 with gr.Row():
                     with gr.Column():
-                        gr.Markdown("#### Select Columns to Display")
                         prefill_columns_to_display = gr.CheckboxGroup(
-                            choices=["Quality", "TTFT", "Link"],
-                            label="Columns",
-                            value=["Quality", "TTFT"]
                         )
                     with gr.Column():
-                        gr.Markdown("#### Model Types")
                         prefill_model_types = gr.CheckboxGroup(
                             choices=list(data["Model"].unique()),
                             label="Model Types",
@@ -103,7 +120,7 @@ This demo leaderboard allows users to explore and compare different KV cache imp
                         )
                     with gr.Column():
-                        gr.Markdown("#### Datasets")
                         prefill_datasets = gr.CheckboxGroup(
                             choices=list(data[data["Stage"] == "prefill"]["Dataset"].unique()),
                             label="Datasets",
@@ -112,21 +129,21 @@ This demo leaderboard allows users to explore and compare different KV cache imp
                 # Prefill-stage compression results
                 with gr.Row():
-                    gr.Markdown("## Prefill-stage Compression Results")
-                prefill_results = gr.Dataframe(value=filter_and_display(["Quality", "TTFT"], list(data["Model"].unique()), list(data["Dataset"].unique()), "prefill"), headers=["Method", "Model", "Quality", "TTFT", "Link"])
                 # Prefill-stage visualization
                 with gr.Row():
-                    gr.Markdown("### Prefill-stage Visualization")
-                    prefill_plot = gr.Plot(value=create_prefill_visualization(filter_and_display(["Quality"], list(data["Model"].unique()), list(data["Dataset"].unique()), "prefill")))
                 # Decode-stage selection
                 with gr.Row():
-                    gr.Markdown("## Decode-stage Selection")
                 with gr.Row():
                     with gr.Column():
-                        gr.Markdown("#### Select Columns to Display")
                         decode_columns_to_display = gr.CheckboxGroup(
                             choices=["Throughput (token/s)", "Quality", "Link"],
                             label="Columns",
@@ -134,7 +151,7 @@ This demo leaderboard allows users to explore and compare different KV cache imp
                         )
                     with gr.Column():
-                        gr.Markdown("#### Model Types")
                         decode_model_types = gr.CheckboxGroup(
                             choices=list(data["Model"].unique()),
                             label="Model Types",
@@ -142,7 +159,7 @@ This demo leaderboard allows users to explore and compare different KV cache imp
                         )
                     with gr.Column():
-                        gr.Markdown("#### Datasets")
                         decode_datasets = gr.CheckboxGroup(
                             choices=list(data[data["Stage"] == "decode"]["Dataset"].unique()),
                             label="Datasets",
@@ -151,14 +168,14 @@ This demo leaderboard allows users to explore and compare different KV cache imp
                 # Decode-stage compression results
                 with gr.Row():
-                    gr.Markdown("## Decode-stage Compression Results")
                 decode_results = gr.Dataframe(value=filter_and_display(["Throughput (token/s)", "Quality"], list(data["Model"].unique()), list(data["Dataset"].unique()), "decode"), headers=["Method", "Model", "Throughput (token/s)", "Quality", "Link"])
                 # Decode-stage visualization
                 with gr.Row():
-                    gr.Markdown("### Decode-stage Visualization")
-                    decode_plot = gr.Plot(value=create_decode_visualization(filter_and_display(["Throughput (token/s)"], list(data["Model"].unique()), list(data["Dataset"].unique()), "decode")))
                 def auto_update_prefill(selected_columns, model_types, datasets):
                     if not model_types or not datasets:

         # Adjust aggregation based on stage
         if stage == "decode":
             filtered = filtered.groupby(["Method", "Model"], as_index=False).agg({
                 "Quality": "mean",
+                "Throughput (token/s)": "mean",
                 "Link": "first"
             })
         else:
             filtered = filtered.groupby(["Method", "Model"], as_index=False).agg({
                 "Quality": "mean",
+                "TTFT (s)": "mean",
                 "Link": "first"
             })
 def create_prefill_visualization(filtered_data):
     if filtered_data.empty:
         return None
+    fig = px.line(filtered_data,
+                  x='TTFT (s)',
+                  y='Quality',
+                  color='Method',
+                  title='Quality-TTFT trade-off of different methods',
+                  markers=True
+    )
+    fig.update_layout(
+        xaxis=dict(range=[0.0, 5.0]),
+        yaxis=dict(range=[0.0, 40.0])
+    )
     return fig
 def create_decode_visualization(filtered_data):
     if filtered_data.empty:
         return None
+    fig = px.line(filtered_data,
+                  x='Throughput (token/s)',
+                  y='Quality',
+                  color='Method',
+                  title='Quality-throughput trade-off of different methods',
+                  markers=True
+    )
+    fig.update_layout(
+        xaxis=dict(range=[0.0, 1000.0]),
+        yaxis=dict(range=[0.0, 1.0])
+    )
     return fig
 # Load the data from the /data folder
     with gr.Blocks() as app:
         with gr.Row():
             gr.Markdown(
+                """# KV Cache Arena
+                   We invite users and developers to explore and compare various KV cache and prompt compression methods across different language models and workloads. Our platform offers interactive filtering options and real-time visualizations, enabling seamless analysis of benchmarking results.
+                """)
         with gr.Tabs():
             with gr.TabItem("KV Cache Benchmark"):
                 # Prefill-stage selection
                 with gr.Row():
+                    gr.Markdown("## Prefill-Stage KV Cache Compression")
                 with gr.Row():
                     with gr.Column():
+                        # gr.Markdown("#### Select Columns to Display")
                         prefill_columns_to_display = gr.CheckboxGroup(
+                            choices=["Quality", "TTFT (s)", "Link"],
+                            label="Metrics",
+                            value=["Quality", "TTFT (s)"]
                         )
                     with gr.Column():
+                        # gr.Markdown("#### Model Types")
                         prefill_model_types = gr.CheckboxGroup(
                             choices=list(data["Model"].unique()),
                             label="Model Types",
                         )
                     with gr.Column():
+                        # gr.Markdown("#### Datasets")
                         prefill_datasets = gr.CheckboxGroup(
                             choices=list(data[data["Stage"] == "prefill"]["Dataset"].unique()),
                             label="Datasets",
                 # Prefill-stage compression results
                 with gr.Row():
+                    gr.Markdown("## Results")
+                prefill_results = gr.Dataframe(value=filter_and_display(["Quality", "TTFT (s)"], list(data["Model"].unique()), list(data["Dataset"].unique()), "prefill"), headers=["Method", "Model", "Quality", "TTFT (s)", "Link"])
                 # Prefill-stage visualization
                 with gr.Row():
+                    # gr.Markdown("### Visualization")
+                    prefill_plot = gr.Plot(value=create_prefill_visualization(filter_and_display(["Quality", "TTFT (s)"], list(data["Model"].unique()), list(data["Dataset"].unique()), "prefill")))
                 # Decode-stage selection
                 with gr.Row():
+                    gr.Markdown("## Decode-Stage KV Cache Compression")
                 with gr.Row():
                     with gr.Column():
+                        # gr.Markdown("#### Select Columns to Display")
                         decode_columns_to_display = gr.CheckboxGroup(
                             choices=["Throughput (token/s)", "Quality", "Link"],
                             label="Columns",
                         )
                     with gr.Column():
+                        # gr.Markdown("#### Model Types")
                         decode_model_types = gr.CheckboxGroup(
                             choices=list(data["Model"].unique()),
                             label="Model Types",
                         )
                     with gr.Column():
+                        # gr.Markdown("#### Datasets")
                         decode_datasets = gr.CheckboxGroup(
                             choices=list(data[data["Stage"] == "decode"]["Dataset"].unique()),
                             label="Datasets",
                 # Decode-stage compression results
                 with gr.Row():
+                    gr.Markdown("## Results")
                 decode_results = gr.Dataframe(value=filter_and_display(["Throughput (token/s)", "Quality"], list(data["Model"].unique()), list(data["Dataset"].unique()), "decode"), headers=["Method", "Model", "Throughput (token/s)", "Quality", "Link"])
                 # Decode-stage visualization
                 with gr.Row():
+                    # gr.Markdown("### Visualization")
+                    decode_plot = gr.Plot(value=create_decode_visualization(filter_and_display(["Quality", "Throughput (token/s)"], list(data["Model"].unique()), list(data["Dataset"].unique()), "decode")))
                 def auto_update_prefill(selected_columns, model_types, datasets):
                     if not model_types or not datasets:

data/{decode_h2o_Llama3.1-8B-Instruct_LongGenBench.json → decode_H2O_Mistral-7B-v0.3_LongGenBench.json} RENAMED Viewed

File without changes

data/{decode_streamingLLM_Llama3.1-8B-Instruct_LongGenBench.json → decode_StreamingLLM_Mistral-7B-v0.3_LongGenBench.json} RENAMED Viewed

File without changes

data/{decode_vllm_Llama3.1-8B-Instruct_LongGenBench.json → decode_vLLM_Mistral-7B-v0.3_LongGenBench.json} RENAMED Viewed

File without changes

data/{prefill_cachegen_Llama3.1-8B-Instruct_NarrativeQA.json → prefill_CacheGen_Mistral-7B-v0.3_NarrativeQA.json} RENAMED Viewed

@@ -1,5 +1,5 @@
 {
     "Quality": 29.53,
-    "TTFT":2.5,
     "Link": "www.google.com"
 }

 {
     "Quality": 29.53,
+    "TTFT (s)": 2.5,
     "Link": "www.google.com"
 }

data/{prefill_kivi_Llama3.1-8B-Instruct_NarrativeQA.json → prefill_KIVI_Mistral-7B-v0.3_NarrativeQA.json} RENAMED Viewed

@@ -1,5 +1,5 @@
 {
     "Quality": 27.27,
-    "TTFT":3.3,
     "Link": "www.google.com"
 }

 {
     "Quality": 27.27,
+    "TTFT (s)": 3.3,
     "Link": "www.google.com"
 }

data/{prefill_vllm_Llama3.1-8B-Instruct_NarrativeQA.json → prefill_vLLM_Mistral-7B-v0.3_NarrativeQA.json} RENAMED Viewed

@@ -1,5 +1,5 @@
 {
     "Quality": 29.26,
-    "TTFT":4.8,
     "Link": "www.google.com"
 }

 {
     "Quality": 29.26,
+    "TTFT (s)": 4.8,
     "Link": "www.google.com"
 }