File size: 13,794 Bytes
51e92f1
 
d579da7
dba9849
 
d579da7
a187ee0
 
128eed0
712c7e8
 
a187ee0
7b630be
f4c78ac
b8b18f8
a187ee0
b8b18f8
712c7e8
b8b18f8
 
a187ee0
b8b18f8
a187ee0
b8b18f8
 
 
088641a
 
a187ee0
088641a
 
a72c0b8
 
a187ee0
088641a
 
a187ee0
 
088641a
 
b8b18f8
 
 
 
088641a
51e92f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb1f36f
51e92f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cae3f7
51e92f1
 
 
 
 
 
 
 
e93c045
51e92f1
 
 
e93c045
51e92f1
e93c045
 
51e92f1
b37aed0
 
 
51e92f1
 
 
 
 
b37aed0
51e92f1
0cae3f7
51e92f1
 
 
 
 
 
 
3a6bfc8
51e92f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cae3f7
51e92f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cae3f7
51e92f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516eb9f
 
 
 
51e92f1
 
 
 
 
 
 
52a48fb
 
516eb9f
 
0cae3f7
516eb9f
 
 
51e92f1
 
 
715fb65
 
34d673d
 
 
 
 
 
 
6ea9021
34d673d
 
 
a07bcdb
dc51565
4bb549d
0cae3f7
51e92f1
0cae3f7
 
 
51e92f1
 
0cae3f7
51e92f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739811b
8c15771
e3cfd2a
51e92f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e08702
0cae3f7
51e92f1
8e08702
51e92f1
 
 
 
8aef4cf
51e92f1
 
8aef4cf
51e92f1
 
 
 
 
 
 
 
 
057b7fe
 
51e92f1
 
 
057b7fe
51e92f1
 
 
057b7fe
51e92f1
057b7fe
51e92f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cae3f7
393b1e1
51e92f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4dd6cee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437


import os
import subprocess
import sys

def install_with_subprocess(args):
    """Run an installation command given as an argv list.

    Raises subprocess.CalledProcessError when the command exits non-zero.
    """
    command = list(args)
    subprocess.run(command, check=True)

def remove_tensorflow():
    """Uninstall tensorflow from the current interpreter's environment.

    Uses the running interpreter's pip so the right environment is targeted.
    """
    uninstall_cmd = [sys.executable, '-m', 'pip', 'uninstall', '-y', 'tensorflow']
    subprocess.run(uninstall_cmd, check=True)
    
# --- One-time environment bootstrap -------------------------------------
# Heavy setup (NeMo, IndicTransTokenizer, TTS, system packages) runs only
# once, guarded by a flag file; the script then re-executes itself so the
# freshly installed packages are importable below.
print("CURRENT WORKING DIRECTORY:",os.getcwd())
print("CURRENT WORKING DIRECTORY LIST:",os.listdir(os.getcwd()))
setup_flag = 'setup_complete.flag'

if not os.path.exists(setup_flag):
    # Remove the preinstalled tensorflow first (presumably it conflicts
    # with the NeMo install below — TODO confirm).
    remove_tensorflow()
    os.system('wget https://indic-asr-public.objectstore.e2enetworks.net/ai4b_nemo.zip')
    os.system('unzip -q ai4b_nemo.zip')
    
    # Install the AI4Bharat NeMo fork in editable mode, then run its own
    # reinstall script from inside the repo.
    os.chdir('NeMo')
    install_with_subprocess([sys.executable, '-m', 'pip', 'install', '-e', '.'])
    os.system('bash reinstall.sh')
    os.chdir('..')

    os.system('git clone -q https://github.com/VarunGumma/IndicTransTokenizer')
    os.chdir('IndicTransTokenizer')
    install_with_subprocess([sys.executable,'-m','pip','install','-q','--editable','./'])
    os.chdir('..')

    # System-level audio dependencies for librosa/soundfile and ffmpeg decoding.
    subprocess.run(['apt-get', 'update'], check=True)
    subprocess.run(['apt-get', 'install', '-y', 'libsndfile1-dev', 'ffmpeg'], check=True)
    
    os.system('git clone https://github.com/gokulkarthik/TTS')
    os.chdir('TTS')
    install_with_subprocess(['pip3', 'install', '-e', '.[all]'])
    install_with_subprocess(['pip3', 'install', '-r', 'requirements.txt'])
    os.chdir('..')
    
    # Write the guard flag BEFORE re-exec so the next run skips setup.
    with open(setup_flag, 'w') as f:
        f.write('Setup complete')

    # Replace the current process with a fresh interpreter so the newly
    # installed packages are visible to the imports that follow.
    os.execv(sys.executable, ['python'] + sys.argv)
    
    

import gradio as gr
from torch import cuda, inference_mode
import nemo.collections.asr as nemo_asr
from IndicTransTokenizer import IndicProcessor
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


# Selected once at startup; ASR model loading (map_location) and TTS
# (--use_cuda flag) both read this.
DEVICE = "cuda" if cuda.is_available() else "cpu"

print(f"Using device: {DEVICE}")


import os
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader

"""### Load and convert PDF data into vectorDB"""

# Knowledge base: the PM-KISAN scheme guidelines PDF (must be present in cwd).
pm_kisan_doc = "PM-KISANOperationalGuidelines(English).pdf"

# NOTE(review): PyPDFLoader is imported twice (also above) — harmless but redundant.
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Overlapping chunks so answers spanning a chunk boundary are still retrievable.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=100
)

loader = PyPDFLoader(pm_kisan_doc)
pages = loader.load_and_split(text_splitter=text_splitter)

pages_chunks = [page.page_content for page in pages]
print(f"Generated {len(pages_chunks)} chunks of {pm_kisan_doc}")

# pages_chunks[8]

# Sentence-transformers MiniLM embeddings feed an in-memory FAISS index,
# queried later by query_vector_db().
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

faiss = FAISS.from_texts(pages_chunks, embeddings)

"""### Querying the vectorDB"""

# Test query
# result = faiss.similarity_search("what are the benefits of PM kisan yojna", k=3)

# This returns the most relevant doc similar to the query

# print(result[0].page_content)

# Result_with_score = faiss.similarity_search_with_score("what are the benefits of PM kisan yojna", k=3)
# Result_with_score[0]



# Gemini credentials come from the environment; get_gemini_output will fail
# to authenticate if GEMINI_API_KEY is unset.
gemini_api_key = os.getenv('GEMINI_API_KEY')

import google.generativeai as genai

def get_gemini_output(prompt, temperature=0.6):
    """Generate a text answer for *prompt* with Gemini Pro.

    Args:
        prompt: Full text prompt to send to the model.
        temperature: Sampling temperature. BUG FIX: previously this
            parameter was accepted but never passed to the model; it is
            now forwarded via a GenerationConfig.

    Returns:
        The generated text of the response.
    """
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel(model_name='gemini-pro')
    answer = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(temperature=temperature),
    )

    return answer.text

"""## Build an end-to-end RAG powered Voice Assistant
"""

# Handles IndicTrans2 pre/post-processing around tokenization (tagging,
# normalization) for both translation directions.
ip = IndicProcessor(inference=True)


# Commented out IPython magic to ensure Python compatibility.
# # %%capture

# English -> Indic distilled 200M IndicTrans2 model.
en2indic_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
en2indic_model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)


# Commented out IPython magic to ensure Python compatibility.
# # %%capture

# Indic -> English distilled 200M IndicTrans2 model.
indic2en_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-indic-en-dist-200M", trust_remote_code=True)
indic2en_model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-indic-en-dist-200M", trust_remote_code=True)


# Lookup table used by indic_translate() to pick direction by source language.
model_tokenizer_config = {
    "en2indic": {
        "tokenizer": en2indic_tokenizer,
        "model": en2indic_model,
    },
    "indic2en": {
        "tokenizer": indic2en_tokenizer,
        "model": indic2en_model,
    }
}

def indic_translate(src_lang: str, tgt_lang: str, sents_to_translate: list):
  """Translate sentences between English and a supported Indic language.

  Args:
      src_lang: Source language name, e.g. "hindi" or "english".
      tgt_lang: Target language name.
      sents_to_translate: List of sentences to translate.

  Returns:
      List of translated sentences.

  Raises:
      KeyError: If either language name is unsupported.
  """
  lang_map = {
    "punjabi": "pan_Guru",
    "bengali": "ben_Beng",
    "malayalam": "mal_Mlym",
    "marathi": "mar_Deva",
    "tamil": "tam_Taml",
    "gujarati": "guj_Gujr",
    "telugu": "tel_Telu",
    "hindi": "hin_Deva",
    "kannada": "kan_Knda",
    "odia": "ory_Orya",
    "english": "eng_Latn"
    }

  src_lang = lang_map[src_lang]
  tgt_lang = lang_map[tgt_lang]

  # Direction is chosen solely by the source language.
  if src_lang == "eng_Latn":
    tokenizer = model_tokenizer_config["en2indic"]["tokenizer"]
    model = model_tokenizer_config["en2indic"]["model"]

    print(f"Using en2indic, src_lang: {src_lang}, tgt_lang: {tgt_lang}")

  else:
    tokenizer = model_tokenizer_config["indic2en"]["tokenizer"]
    model = model_tokenizer_config["indic2en"]["model"]

    print(f"Using indic2en, src_lang: {src_lang}, tgt_lang: {tgt_lang}")


  batch = ip.preprocess_batch(sents_to_translate, src_lang=src_lang, tgt_lang=tgt_lang, show_progress_bar=False)
  batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")

  with inference_mode():
      print("Generating...")
      outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)

  with tokenizer.as_target_tokenizer():
      outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)

  # BUG FIX: the flores code for English is "eng_Latn" (see lang_map above),
  # not "en_Latn" — the old comparison was always true, so Indic
  # postprocessing also ran on English output.
  if tgt_lang != "eng_Latn":
    print(f"Postprocessing for {tgt_lang}")
    outputs = ip.postprocess_batch(outputs, lang=tgt_lang)


  return outputs

def download_ai4b_tts_model(lang: str):
  """Check whether the IndicTTS archive for *lang* already exists on disk.

  Despite its name, this function only reports presence of /<code>.zip;
  it performs no download.

  Raises:
      KeyError: If *lang* is not a supported language name.
  """
  iso_codes = {
      "odia": "or",
      "hindi": "hi",
      "tamil": "ta",
      "telugu": "te",
      "punjabi": "pa",
      "kannada": "kn",
      "bengali": "bn",
      "marathi": "mr",
      "gujarati": "gu",
      "malayalam": "ml",
  }

  archive_path = f"/{iso_codes[lang]}.zip"

  if os.path.exists(archive_path):
    print(f"IndicTTS Model for {lang} already exists.")

def run_tts(text, tts_lang):
  """Synthesize *text* to /tts_output.wav with IndicTTS (FastPitch + HiFiGAN).

  Args:
      text: Text to speak, already in the target language's script.
      tts_lang: Language name, e.g. "hindi".

  Raises:
      KeyError: If *tts_lang* is not a supported language.
  """
  import shlex  # local import: only needed here, to shell-quote the text

  lang_map = {
      "odia": "or",
      "hindi": "hi",
      "tamil": "ta",
      "telugu": "te",
      "punjabi": "pa",
      "kannada": "kn",
      "bengali": "bn",
      "marathi": "mr",
      "gujarati": "gu",
      "malayalam": "ml",
  }

  download_ai4b_tts_model(lang=tts_lang)

  tts_lang = lang_map[tts_lang]
  print(f"Lang code: {tts_lang}")

  print("TTS WORKING DIRECTORY:",os.getcwd())
  # SECURITY FIX: quote the text so quotes / backticks / $() in model output
  # cannot break out of the shell command — the previous f-string
  # interpolation inside double quotes was shell-injectable.
  # NOTE(review): --out_path is absolute (/tts_output.wav) while the Gradio
  # handler returns the relative path "tts_output.wav" — confirm both
  # resolve to the same file in deployment.
  tts_command = f'python3 -m TTS.bin.synthesize --text {shlex.quote(text)} \
    --model_path /models/v1/{tts_lang}/fastpitch/best_model.pth \
    --config_path /models/v1/{tts_lang}/fastpitch/config.json \
    --vocoder_path /models/v1/{tts_lang}/hifigan/best_model.pth \
    --vocoder_config_path /models/v1/{tts_lang}/hifigan/config.json \
    --speakers_file_path /models/v1/{tts_lang}/fastpitch/speakers.pth \
    --out_path /tts_output.wav \
    --speaker_idx male'

  if DEVICE == "cuda":
    tts_command += " --use_cuda True"
    print(f"Running IndicTTS on GPU")

  else:
    print(f"Running IndicTTS on CPU")

  os.system(tts_command)

# Cache directory for downloaded .nemo ASR checkpoints (exist_ok: idempotent).
os.makedirs('asr_models', exist_ok=True)
def download_ai4b_asr_model(lang: str):
  """Download (or reuse a cached) IndicConformer ASR checkpoint for *lang*.

  Args:
      lang: Language name, e.g. "hindi".

  Returns:
      Local path to the .nemo checkpoint under asr_models/.

  Raises:
      ValueError: If *lang* is not a supported language.
      subprocess.CalledProcessError: If the wget download fails.
  """
  available_langs = {
      "odia": "or",
      "hindi": "hi",
      "tamil": "ta",
      "telugu": "te",
      "punjabi": "pa",
      "kannada": "kn",
      "bengali": "bn",
      "marathi": "mr",
      "gujarati": "gu",
      "malayalam": "ml",
  }

  # BUG FIX: validate before any dict lookup — previously the lookup in the
  # download-path expression raised KeyError first, making this check
  # unreachable.
  if lang not in available_langs:
      raise ValueError(f"Invalid language code: {lang}")

  os.makedirs('asr_models', exist_ok=True)
  download_dir = "asr_models"
  download_path = os.path.join(download_dir, f"ai4b_indicConformer_{available_langs[lang]}.nemo")
  print(f"Downloaded ASR model path: {download_path}")

  if os.path.exists(download_path):
      print(f"Model for {lang} already exists.")
  else:
      url = f"https://objectstore.e2enetworks.net/indic-asr-public/indicConformer/ai4b_indicConformer_{available_langs[lang]}.nemo"
      try:
          result = subprocess.run(['wget', url, '-O', download_path], check=True, capture_output=True, text=True)
          print("ASR MODEL DOWNLOADED SUCCESSFULLY", result.stdout)
      # BUG FIX: the exception class is CalledProcessError; the old
      # "CallProcessError" raised AttributeError whenever wget failed.
      except subprocess.CalledProcessError as e:
          print(f"Error occured: {e.stderr}")
          raise

  return download_path

# Repeated directory creation; harmless because exist_ok=True is idempotent.
os.makedirs('asr_models', exist_ok=True)

def ensure_numpy_version():
    """Ensure a numpy 1.23.x release is installed, installing the pin if not.

    BUG FIX: the previous exact-string comparison (np.__version__ != '1.23')
    could never match a real release such as '1.23.5', so the pin was
    reinstalled on every run; the ImportError branch also installed a
    different version (1.21.0) than the one being checked for.
    """
    required_version = '1.23'
    try:
        import numpy as np
        # Compare only the major.minor components against the pin.
        if np.__version__.split('.')[:2] != required_version.split('.'):
            subprocess.run(['pip', 'install', f'numpy=={required_version}'], check=True)
    except ImportError:
        subprocess.run(['pip', 'install', f'numpy=={required_version}'], check=True)

# Apply the numpy pin before any audio libraries are exercised, then pin
# numba (presumably for compatibility with the pinned numpy — TODO confirm).
ensure_numpy_version()

subprocess.run(['pip', 'install', 'numba==0.60.0'], check=True)


# import librosa

# def preprocess_audio(audio_path):
#     audio,sr = librosa.load(audio_path,sr=None,mono=True)
#     return audio, sr

def transcribe(audio: str, lang: str):
    """Transcribe the audio file at path *audio* using the IndicConformer ASR
    model for *lang*, downloading the checkpoint first if needed.

    Returns:
        The transcription string.

    Raises:
        KeyError: If *lang* is not a supported language name.
    """
    iso_code = {
        "odia": "or",
        "hindi": "hi",
        "tamil": "ta",
        "telugu": "te",
        "punjabi": "pa",
        "kannada": "kn",
        "bengali": "bn",
        "marathi": "mr",
        "gujarati": "gu",
        "malayalam": "ml",
    }[lang]

    checkpoint_path = download_ai4b_asr_model(lang=lang)
    asr_model = nemo_asr.models.ASRModel.restore_from(checkpoint_path, map_location=DEVICE)

    result = asr_model.transcribe([audio], batch_size=1, language_id=iso_code)[0][0]
    print(f"Transcription: {result}")
    return result

def query_vector_db(query):
  """Return the contents of the 3 most similar chunks for *query*, joined by spaces."""
  top_matches = faiss.similarity_search(query, k=3)
  return " ".join(doc.page_content for doc in top_matches)

from langchain_core.prompts import PromptTemplate

def process_user_query(user_query, retrieved_doc):
  """Ask Gemini to answer *user_query* using *retrieved_doc* as grounding context.

  Returns the model's formatted answer text.
  """
  template = PromptTemplate.from_template(
    "You are a chatbot , which provides information to user based on their queries, \
    the user asks: {user_query}, The information from the related query is: {retrieved_doc}. \
    Now give the output based on the query and relevant information that i provided, written in a structured, well-formatted and concise way. \
    The length of the output should be no more than 70 words, must be in 5 lines."
  )
  prompt = template.format(user_query=user_query, retrieved_doc=retrieved_doc)
  print("Input prompt:", prompt)

  answer = get_gemini_output(prompt)
  print("Output prompt:", answer)
  return answer




def process_gradio_input(audio, user_lang):
    """End-to-end pipeline for one Gradio request: speech -> text -> RAG answer -> speech.

    Args:
        audio: Path to the uploaded/recorded audio file.
        user_lang: Language name selected in the UI, e.g. "hindi".

    Returns:
        Tuple of (answer text in the user's language, path to the TTS wav file).
    """
    # 1. Speech to text in the user's language.
    print(f"Transcribing...")
    transcription = transcribe(audio, lang=user_lang)

    # 2. Translate to English so the LLM and the English vector DB can be used.
    print(f"Translating indic to en..")
    english_query = indic_translate(src_lang=user_lang, tgt_lang="english",
                                    sents_to_translate=[transcription])[0]

    # 3. Retrieve grounding context for the query.
    print(f"Querying vector db")
    context_doc = query_vector_db(english_query)

    # 4. Generate the English answer from query + context.
    print(f"Processing user query")
    english_answer = process_user_query(user_query=english_query, retrieved_doc=context_doc)

    # 5. Split into sentence-sized chunks so translation can batch them.
    print(f"Breaking document into chunks..")
    sentence_chunks = [f"{part}." for part in english_answer.strip().split(". ") if part != ""]

    # 6. Translate the answer back into the user's language.
    print(f"Translating en to indic..")
    translated_chunks = indic_translate(src_lang="english", tgt_lang=user_lang,
                                        sents_to_translate=sentence_chunks)
    indic_answer = " ".join(translated_chunks)
    print(f"en_to_indic_doc: {indic_answer}")

    # 7. Synthesize the spoken answer.
    print(f"Running TTS to generate audio..")
    run_tts(text=indic_answer, tts_lang=user_lang)
    print("Finished running TTS")

    print("PRESENT WORKING DIRECTORY OF AUDIO SAVED:", os.getcwd())
    return indic_answer, "tts_output.wav"


def launch_gradio_app(show_log=False):
  """Build and launch the Gradio UI for the voice assistant.

  Args:
      show_log: When True, run Gradio in debug mode (streams logs).
  """
  supported_languages = [
      "hindi", "odia", "tamil", "telugu", "punjabi",
      "kannada", "bengali", "marathi", "gujarati", "malayalam",
  ]

  app = gr.Interface(
      fn=process_gradio_input,
      inputs=[
          # Audio comes in as a file path, matching transcribe()'s input.
          gr.Audio(sources=['upload', 'microphone'], type="filepath", show_download_button=True),
          gr.Dropdown(supported_languages, label="Language", value="hindi"),
      ],
      outputs=["text", "audio"],
      allow_flagging="never",
      title="Farmer's Voice Assistant 🧑‍🌾 Powered by AI4Bharat Tech",
      description="Know about latest farming schemes, this system is powered by tools from AI4Bharat, like IndicASR, IndicTTS and IndicTrans",
  )

  app.launch(debug=show_log)

# Entry point: start the app with verbose logging enabled.
launch_gradio_app(show_log=True)