File size: 2,800 Bytes
9966b88
 
 
 
98cc895
 
 
 
 
f555c09
9c53030
 
 
 
 
 
 
 
 
7d8479e
 
 
 
 
 
9c53030
98cc895
 
 
9966b88
 
98cc895
 
 
9c53030
0bbc8ff
98cc895
9c53030
0bbc8ff
 
 
 
9c53030
 
0bbc8ff
 
 
 
 
 
 
9c53030
 
 
0bbc8ff
98cc895
 
9c53030
 
 
 
 
0bbc8ff
9c53030
98cc895
 
 
9c53030
 
 
 
 
98cc895
9c53030
 
 
 
 
 
0bbc8ff
 
 
 
 
 
 
 
 
ee5fabd
0bbc8ff
 
ee5fabd
 
 
 
0bbc8ff
9c53030
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Imports grouped at top per PEP 8 (they were originally interleaved
# with the setup code below).
import gradio as gr
import pandas as pd
from gradio.themes.utils.colors import green, red
from spacy.lang.en import English
from transformers import pipeline

# Lightweight English pipeline used only for sentence segmentation
# (no full model load; just the rule-based sentencizer component).
nlp = English()
nlp.add_pipe("sentencizer")

# Sentence-level detector. Per predict_one_sent: LABEL_1 = machine-written,
# LABEL_0 = human-written; downloads the model on first run.
detector = pipeline(task='text-classification', model='SJTU-CL/RoBERTa-large-ArguGPT-sent')

# Probability-bucket -> highlight color for gr.HighlightedText:
# green shades for human-leaning buckets (<50%), red for machine-leaning.
color_map = {
    '0%': green.c400,
    '10%': green.c300,
    '20%': green.c200,
    '30%': green.c100,
    '40%': green.c50,
    '50%': red.c50,
    '60%': red.c100,
    '70%': red.c200,
    '80%': red.c300,
    '90%': red.c400,
    '100%': red.c500
}


def predict_doc(doc):
    """Split an essay into sentences, score each one, and aggregate.

    Args:
        doc: The essay text to analyze.

    Returns:
        A 4-tuple matching the Gradio outputs:
        - summary string naming the likely author (Human/Machine),
        - list of (sentence, bucket-label) pairs for HighlightedText,
          where bucket-label is one of '0%' .. '100%' (keys of color_map),
        - the per-sentence pandas DataFrame (sentence, label, score),
        - the path of the CSV file the DataFrame was written to.
    """
    sents = [s.text for s in nlp(doc).sents]
    data = {'sentence': [], 'label': [], 'score': []}
    res = []
    for sent in sents:
        prob = predict_one_sent(sent)

        data['sentence'].append(sent)
        data['score'].append(round(prob, 4))
        data['label'].append('Human' if prob <= 0.5 else 'Machine')

        # Bucket prob into 10% bands: [0,0.1)->'0%', ..., [0.9,1)->'90%',
        # 1.0->'100%'. Replaces the original 11-branch if/elif ladder;
        # min(..., 10) clamps prob == 1.0 into the last bucket.
        res.append((sent, f'{min(int(prob * 10), 10) * 10}%'))

    df = pd.DataFrame(data)
    df.to_csv('result.csv')

    # Guard the empty-input case: with no sentences df.score.mean() is NaN,
    # which would otherwise produce a nonsense "written by Machine ... nan"
    # summary (NaN <= 0.5 is False).
    if not sents:
        return 'No sentences found in the input.', res, df, 'result.csv'

    overall_score = df.score.mean()
    overall_label = 'Human' if overall_score <= 0.5 else 'Machine'
    sum_str = f'The essay is probably written by {overall_label}. The probability of being generated by AI is {overall_score}'

    return sum_str, res, df, 'result.csv'


def predict_one_sent(sent):
    """Return the probability that *sent* is machine-generated.

    The detector yields a (label, score) pair where the score is the
    confidence in the reported label, so:
        ('LABEL_1', 0.66) -> 0.66   (machine with p=0.66)
        ('LABEL_0', 0.66) -> 0.34   (human with p=0.66 -> machine p=0.34)
    """
    top = detector(sent)[0]
    machine_prob = top['score']
    if top['label'] == 'LABEL_0':
        # Score is confidence in "human"; flip it to the machine probability.
        machine_prob = 1 - machine_prob
    return machine_prob


# --- Gradio UI: input column + highlighted output, summary/CSV row, table ---
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # Essay input and the trigger button, stacked in the left column.
            text_in = gr.Textbox(
                lines=5, 
                label='Essay input', 
                info='Please enter the essay in the textbox'
            )
            btn = gr.Button('Predict who writes this essay!')

        # Per-sentence view, colored by probability bucket via color_map.
        sent_res = gr.HighlightedText(label='Labeled Result', color_map=color_map)
    
    with gr.Row():
        summary = gr.Text(label='Result summary')
        csv_f = gr.File(label='CSV file storing data with all sentences.')

    # Tabular per-sentence scores; row_count=100 sizes the displayed table.
    tab = gr.Dataframe(label='Table with Probability Score', row_count=100)
    # Outputs map 1:1 onto predict_doc's 4-tuple return value.
    btn.click(predict_doc, inputs=[text_in], outputs=[summary, sent_res, tab, csv_f], api_name='predict_doc')

demo.launch()