import os
import duckdb
import gradio as gr
from httpx import Client
from huggingface_hub import HfApi
import pandas as pd
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import spaces
from llama_cpp import Llama


BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
headers = {
    "Accept": "application/json",
    "Content-Type": "application/json",
}
client = Client(headers=headers)
api = HfApi()
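# Load the DuckDB-NSQL text-to-SQL model from a local quantized GGUF file, offloading layers to the GPU.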
llama = Llama(
    model_path="DuckDB-NSQL-7B-v0.1-q8_0.gguf",
    n_ctx=2048,
    n_gpu_layers=50,
)

@spaces.GPU
def generate_sql(prompt):
    # Run the prompt through the model; a low temperature keeps the generated SQL close to deterministic.
    pred = llama(prompt, temperature=0.1, max_tokens=1000)
    return pred["choices"][0]["text"]

def get_first_parquet(dataset: str):
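    # Ask the Datasets Server for the dataset's parquet export and return the first parquet file entry.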
    resp = client.get(f"{BASE_DATASETS_SERVER_URL}/parquet?dataset={dataset}")
    return resp.json()["parquet_files"][0]


def text2sql(dataset_name, query_input):
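    # Derive the dataset schema from its first parquet file, prompt the model for SQL, and execute the result with DuckDB.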
    print(f"start text2sql for {dataset_name}")
    try:
        first_parquet = get_first_parquet(dataset_name)
    except Exception as error:
        return {
            schema_output: "",
            prompt_output: "",
            query_output: "",
            df: pd.DataFrame([{"error": f"❌ Could not get dataset schema. {error=}"}])
        }

    first_parquet_url = first_parquet["url"]
    print(f"getting schema from {first_parquet_url}")
    con = duckdb.connect()
    con.execute("INSTALL 'httpfs'; LOAD httpfs;")
    # Sample a single row so DuckDB infers the table schema (it could also be read from the parquet metadata directly).
    con.execute(f"CREATE TABLE data AS SELECT * FROM '{first_parquet_url}' LIMIT 1;")
    result = con.sql("SELECT sql FROM duckdb_tables() WHERE table_name = 'data';").df()
    ddl_create = result.iloc[0, 0]
    
    text = f"""### Instruction:
    Your task is to generate valid duckdb SQL to answer the following question.

    ### Input:
    Here is the database schema that the SQL query will run on:
    {ddl_create}
    
    ### Question:
    {query_input}

    ### Response (use duckdb shorthand if possible):
    """
    try:
        sql_output = generate_sql(text)
    except Exception as error:
        return {
            schema_output: ddl_create,
            prompt_output: text,
            query_output: "",
            df: pd.DataFrame([{"error": f"❌ Could not generate a SQL query from the text. {error=}"}])
        }

    # Ideally the prompt would make the model reference the parquet URL directly; until it does, rewrite the table reference here.
    sql_output = sql_output.replace("FROM data", f"FROM '{first_parquet_url}'")
    try:
        query_result = con.sql(sql_output).df()
    except Exception as error:
        query_result = pd.DataFrame([{"error": f"❌ Could not execute the SQL query. {error=}"}])
    finally:
        con.close()
    return {
        schema_output: ddl_create,
        prompt_output: text,
        query_output: sql_output,
        df: query_result
    }


with gr.Blocks() as demo:
    gr.Markdown("# πŸ’« Generate SQL queries based on a given text for your Hugging Face Dataset πŸ’«")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for a dataset id on Hugging Face",
        search_type="dataset",
        value="jamescalam/world-cities-geo",
    )
    # dataset_name = gr.Textbox("jamescalam/world-cities-geo", label="Dataset Name")
    query_input = gr.Textbox("Cities from Albania country", label="Ask something about your data")
    examples = [
        ["Cities from Albania country"],
        ["The continent with the most number of countries"],
        ["Cities that start with 'A'"],
        ["Cities by region"],
    ]
    gr.Examples(examples=examples, inputs=[query_input], outputs=[])
    btn = gr.Button("Generate SQL")
    query_output = gr.Textbox(label="Output SQL", interactive=False)
    df = gr.DataFrame(datatype="markdown")
    with gr.Accordion("Open for prompt details", open=False):
        schema_output = gr.Textbox(label="Parquet Schema as CREATE DDL", interactive=False)
        prompt_output = gr.Textbox(label="Generated prompt", interactive=False)
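    # text2sql returns a dict keyed by these output components, so all four are wired as click outputs.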
    btn.click(text2sql, inputs=[dataset_name, query_input], outputs=[schema_output, prompt_output, query_output, df])
demo.launch(debug=True)