Spaces: pile-v2-small data-quality inspection demo.
NOTE: the deployed Space reported "Runtime error" at startup.
| import gradio as gr | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| # ai4code_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AI4Code") | |
| # amps_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AMPS") | |
| # apache_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/ASFPublicMail") | |
| # books3_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Books3") | |
| # cp_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/CPDataset") | |
| # dmmath_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/DMMath") | |
| # discourse_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Discourse") | |
| # wiki_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Enwiki") | |
| # euro_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/EuroParliamentProceedings") | |
| # freelaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/FreeLaw_Options") | |
| # ghdiffs_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubDiff") | |
| # ghissues_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubIssues") | |
| # gutenberg_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Gutenberg") | |
| # leet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/LeetCode") | |
| # pileoflaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PileOfLaw") | |
| # pubmed_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PubMed") | |
| # s2orc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/S2ORC") | |
| # se_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/StackExchange") | |
| # usenet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USENET") | |
| # uspto_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USPTO") | |
| # ubuntuirc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/UbuntuIRC") | |
| # arxiv_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/arXiv") | |
# Names of the pile-v2-small subsets shown in the UI (one per commented-out
# load_dataset(...) call above).
_DATASET_NAMES = [
    "AI4Code", "AMPS", "ASFPublicMail", "Books3", "CPDataset", "DMMath",
    "Discourse", "Enwiki", "EuroParliamentProceedings", "FreeLaw_Options",
    "GitHubDiff", "GitHubIssues", "Gutenberg", "LeetCode", "PileOfLaw",
    "PubMed", "S2ORC", "StackExchange", "USENET", "USPTO", "UbuntuIRC",
    "arXiv",
]


def _fake_stats(n_docs=1000):
    """Return placeholder per-document statistics for one dataset.

    Stand-in for loading the real dataset: the three ratio columns are
    standard-normal draws and ``num_words`` is a uniform integer draw,
    matching the shapes the plotting UI expects.

    Args:
        n_docs: number of fake documents to generate statistics for.

    Returns:
        dict mapping statistic name -> 1-D numpy array of length ``n_docs``.
    """
    return {
        "word_rep_ratios": np.random.randn(n_docs),
        "char_rep_ratios": np.random.randn(n_docs),
        "flagged_word_ratios": np.random.randn(n_docs),
        "num_words": np.random.randint(0, 1000, n_docs),
    }


# dataset name -> {statistic name -> array of fake per-document values}.
# Built with a comprehension instead of 22 copy-pasted identical literals.
dataset_data = {name: _fake_stats() for name in _DATASET_NAMES}
def plt_plot(threshold, x):
    """Render a 50-bin histogram of *x* with a dashed cutoff marker.

    Args:
        threshold: x-position for the dashed red vertical line (the
            filtering cutoff chosen on the slider).
        x: 1-D array-like of values to histogram.

    Returns:
        The matplotlib Figure, for display in a gradio ``Plot`` component.
    """
    fig, ax = plt.subplots()
    ax.hist(x, bins=50)
    # Overlay the chosen cutoff on top of the distribution.
    ax.axvline(threshold, color='r', linestyle='dashed', linewidth=2)
    ax.set_title("Histogram of random data")
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    return fig
def _make_hist_fn(stat_key):
    """Build a click callback that histograms one statistic of a dataset.

    Args:
        stat_key: key into the per-dataset stats dict
            (e.g. ``"char_rep_ratios"``).

    Returns:
        A function (threshold, dataset_name) -> matplotlib Figure suitable
        for ``Button.click``; it resolves the data at click time from the
        radio selection.
    """
    def _plot(threshold, dataset_name):
        return plt_plot(threshold, dataset_data[dataset_name][stat_key])
    return _plot


with gr.Blocks() as demo:
    dataset = gr.Radio(list(dataset_data.keys()), label="Dataset")
    # BUG FIX: the original wired the click as
    #   calculate.click(plt_plot, [threshold, dataset_data[dataset].char_rep_ratios], plot)
    # which crashed at build time: `dataset` is the gr.Radio *component*,
    # not the selected string, and the inner stats dicts are plain dicts
    # with no attribute access.  Instead we pass the radio component as an
    # input and look the data up inside the callback when clicked.
    for tab_label, stat_key, slider_max in [
        ("Character Repetition Ratio", "char_rep_ratios", 100),
        ("Word Repetition Ratio", "word_rep_ratios", 1),
        ("Flagged Word Ratio", "flagged_word_ratios", 1),
    ]:
        with gr.Tab(tab_label):
            plot = gr.Plot()
            threshold = gr.Slider(minimum=0, maximum=slider_max, label="Threshold")
            calculate = gr.Button("Calculate")
            # gradio passes the current slider value and radio selection
            # (a string) to the callback.
            calculate.click(_make_hist_fn(stat_key), [threshold, dataset], plot)

if __name__ == "__main__":
    demo.launch(share=True)