kernelmachine committed on
Commit
842e849
1 Parent(s): 893dac4

added model

Browse files
app.py CHANGED
@@ -1,7 +1,72 @@
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import argparse
3
+ import json
4
+ import logging
5
+ import os
6
+ import sys
7
+ import pathlib
8
+ import random
9
+ import shutil
10
+ import time
11
+ from typing import Any, Dict, List, Union
12
 
13
+ import numpy as np
14
+ import pandas as pd
15
+ from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer, HashingVectorizer,
16
+ TfidfVectorizer)
17
+ from sklearn.linear_model import LogisticRegression
18
+ from sklearn.metrics import f1_score
19
+ from sklearn.model_selection import train_test_split
20
+ from tqdm import tqdm
21
+ from lr.hyperparameters import SEARCH_SPACE, RandomSearch, HyperparameterSearch
22
+ from shutil import rmtree
23
 
24
+
25
def load_model(serialization_dir):
    """Rebuild the saved logistic-regression classifier and its vectorizer.

    Reads ``best_hyperparameters.json`` (and ``vocab.json`` unless the
    weighting is ``"hash"``) from *serialization_dir*, and the fitted arrays
    ``coef.npy``, ``intercept.npy``, ``classes.npy`` (plus ``idf.npy`` when it
    exists) from its ``archive`` sub-directory.

    Args:
        serialization_dir: Directory holding the saved model files.

    Returns:
        A ``(classifier, vectorizer)`` tuple.

    Raises:
        FileNotFoundError: If a required file is missing.
        KeyError: If a required hyperparameter key is missing.
    """
    with open(os.path.join(serialization_dir, "best_hyperparameters.json"), 'r') as f:
        hyperparameters = json.load(f)

    # These three keys configure the vectorizer; whatever remains afterwards
    # is forwarded to LogisticRegression as keyword arguments.
    stop_words = 'english' if hyperparameters.pop('stopwords') == 1 else None
    weight = hyperparameters.pop('weight')
    # Stored as a space-separated string such as "1 2". scikit-learn documents
    # ngram_range as a (min_n, max_n) tuple, so convert instead of passing a list.
    ngram_range = tuple(sorted(int(x) for x in hyperparameters.pop('ngram_range').split()))

    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    elif weight == 'hash':
        vect = HashingVectorizer(stop_words=stop_words,
                                 lowercase=True,
                                 ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=(weight == 'binary'),
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)

    # Only the non-hash branch loads a saved vocabulary.
    if weight != "hash":
        with open(os.path.join(serialization_dir, "vocab.json"), 'r') as f:
            vect.vocabulary_ = json.load(f)

    hyperparameters['C'] = float(hyperparameters['C'])
    hyperparameters['tol'] = float(hyperparameters['tol'])
    classifier = LogisticRegression(**hyperparameters)

    # Restore the fitted state by assigning the saved arrays directly; the
    # estimator is never re-fit here.
    archive_dir = os.path.join(serialization_dir, "archive")
    idf_path = os.path.join(archive_dir, "idf.npy")
    if os.path.exists(idf_path):
        vect.idf_ = np.load(idf_path)
    classifier.coef_ = np.load(os.path.join(archive_dir, "coef.npy"))
    classifier.intercept_ = np.load(os.path.join(archive_dir, "intercept.npy"))
    classifier.classes_ = np.load(os.path.join(archive_dir, "classes.npy"))
    return classifier, vect
63
+
64
def score(x, clf, vectorizer):
    """Score a single document.

    Wraps *x* in a one-element list, transforms it with *vectorizer* and
    returns whatever ``clf.predict_proba`` produces for that input.
    """
    features = vectorizer.transform([x])
    probabilities = clf.predict_proba(features)
    return probabilities
67
+
68
# Load the classifier and vectorizer once at import time so that every call
# to start() reuses the same objects.
clf, vectorizer = load_model("model/")


def start(text):
    """Return the rounded score for *text*, keyed by its display label.

    The value is entry ``[0][1]`` of the result of ``score()``, rounded to
    two decimal places.
    """
    probabilities = score(text, clf, vectorizer)
    second_class_probability = probabilities[0][1]
    return {"GPT-3 Filter Quality Score": round(second_class_probability, 2)}
model/archive/classes.npy ADDED
Binary file (144 Bytes). View file
 
model/archive/coef.npy ADDED
Binary file (8.39 MB). View file
 
model/archive/intercept.npy ADDED
Binary file (136 Bytes). View file
 
model/best_hyperparameters.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"C": 0.977778, "multi_class": "auto", "ngram_range": "1 2", "penalty": "l1", "random_state": 44555, "solver": "liblinear", "stopwords": null, "tol": 0.000816, "weight": "hash"}
model/results.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"C":0.977778,"dev_accuracy":0.9332787322,"dev_f1":0.9226169818,"multi_class":"auto","ngram_range":"[1, 2]","penalty":"l1","random_state":44555,"solver":"liblinear","stopwords":null,"tol":0.000816,"training_duration":807.7028501034,"weight":"hash"}
score.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm.auto import tqdm
2
+ import numpy as np
3
+
4
+
5
def score_text(df, clf, clf_vectorizer, field='text'):
    """Score every row of *df* with the quality filter.

    Adds three columns to *df* in place and returns the same frame:

    * ``prob_low_quality``  - column 0 of ``clf.predict_proba``
    * ``prob_high_quality`` - column 1 of ``clf.predict_proba``
    * ``GPT3_included``     - True where a Pareto(9) draw exceeds
      ``1 - prob_high_quality``

    Args:
        df: Frame containing the documents to score.
        clf: Classifier exposing ``predict_proba``.
        clf_vectorizer: Vectorizer exposing ``transform``.
        field: Name of the column holding the document text.

    Returns:
        *df* with the three columns above added.
    """
    # tqdm only wraps the iteration over documents to show progress.
    probs = np.asarray(clf.predict_proba(clf_vectorizer.transform(tqdm(df[field]))))

    # Write the result columns straight from the probability array, so no
    # scratch column is ever left behind on the caller's frame.
    df['prob_low_quality'] = probs[:, 0]
    df['prob_high_quality'] = probs[:, 1]

    # One Pareto(9) sample per row, drawn from numpy's global random state.
    draws = np.random.pareto(9, size=len(probs))
    df['GPT3_included'] = draws > (1 - probs[:, 1])

    return df
14
+
15
def get_counts(df, field='text'):
    """Add a ``num_tokens`` column holding each row's whitespace token count.

    The column is written onto *df* itself, which is also returned.
    """
    def _whitespace_token_count(text):
        return len(text.split())

    # Registers ``progress_apply`` on pandas objects.
    tqdm.pandas()
    token_counts = df[field].progress_apply(_whitespace_token_count)
    df['num_tokens'] = token_counts
    return df
20
+
21
def score(x, clf, vectorizer):
    """Score a single document.

    The document is passed to *vectorizer* as a one-element list, and the
    transformed result is handed to ``clf.predict_proba``, whose output is
    returned unchanged.
    """
    single_doc_batch = [x]
    vectorized = vectorizer.transform(single_doc_batch)
    return clf.predict_proba(vectorized)