Spaces:

srijaydeshpande
/

DeID

Sleeping

App Files Community

srijaydeshpande committed on Jul 17

Commit

25632d4

•

1 Parent(s): 5958396

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -78

app.py CHANGED Viewed

@@ -18,13 +18,21 @@ from llama_cpp_agent.chat_history.messages import Roles
 # subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
 # subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)
 hf_hub_download(
-    repo_id="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF",
-    filename="Meta-Llama-3-8B-Instruct.Q8_0.gguf",
     local_dir = "./models"
 )
 # hf_hub_download(
 #     repo_id="bartowski/Meta-Llama-3-70B-Instruct-GGUF",
 #     filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
@@ -79,95 +87,107 @@ def txt_to_html(text):
     return html_content
 def deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability):
-    #### Remove Dates ###
-    prompt = "In the following text replace only the calendar dates with term [date]. Example: if input is 'Date of birth: 15/5/1959 calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)' output should be 'Date of birth: [date] calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)'"
-    output = llm.create_chat_completion(
-        messages=[
-            {"role": "assistant", "content": prompt},
-            {
-                "role": "user",
-                "content": pdftext
-            }
-        ],
-        max_tokens=maxtokens,
-        temperature=temperature
-    )
-    output = output['choices'][0]['message']['content']
-    # Remove starting header string in output
-    find_index = output.find(' '.join(pdftext.split()[:3]))
-    if find_index != -1:
-        output = output[find_index:].strip()
-    # #### Remove Locations and Addresses ###
-    prompt = "In the following text replace location or address, such as '3970 Longview Drive, CV36HE' with term [address]. Replace complete GP address, such as 'Phanton Medical Centre, Birmingham, CV36HE' with term [address]. It is important that all addresses are completely replaced with [address]."
     output = llm.create_chat_completion(
         messages=[
-            {"role": "assistant", "content": prompt},
-            {
-                "role": "user",
-                "content": output
-            }
         ],
         max_tokens=maxtokens,
         temperature=temperature
     )
     output = output['choices'][0]['message']['content']
     # Remove starting header string in output
     find_index = output.find(' '.join(pdftext.split()[:3]))
     if find_index != -1:
         output = output[find_index:].strip()
-    #### Remove Names ###
-    prompt = "In the following text replace any person name with term [name]. It is important that all person names are replaced with term [name]. Remove any gender terms 'male' or 'female' if exists."
-    output = llm.create_chat_completion(
-        messages=[
-            {"role": "assistant", "content": prompt},
-            {
-                "role": "user",
-                "content": output
-            }
-        ],
-        max_tokens=maxtokens,
-        temperature=temperature
-    )
-    output = output['choices'][0]['message']['content']
-    # Remove starting header string in output
-    find_index = output.find(' '.join(pdftext.split()[:3]))
-    if find_index != -1:
-        output = output[find_index:].strip()
-    ### Remove Registration Numbers ###
-    prompt = "In the following text replace the nhs number and the case note number with term [ID]. Replace Hospital number with [ID]."
-    output = llm.create_chat_completion(
-        messages=[
-            {"role": "assistant", "content": prompt},
-            {
-                "role": "user",
-                "content": output
-            }
-        ],
-        max_tokens=maxtokens,
-        temperature=temperature
-    )
-    output = output['choices'][0]['message']['content']
-    # Remove starting header string in output
-    find_index = output.find(' '.join(pdftext.split()[:3]))
-    if find_index != -1:
-        output = output[find_index:].strip()
     return output
@@ -175,7 +195,7 @@ def deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability):
 def pdf_to_text(files, maxtokens=2048, temperature=0, top_probability=0.95):
     files=[files]
     llm = Llama(
-        model_path="models/Meta-Llama-3-8B-Instruct.Q8_0.gguf",
         flash_attn=True,
         n_gpu_layers=81,
         n_batch=1024,

 # subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
 # subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)
+repo_id = "srijaydeshpande/Deid-Fine-Tuned"
+model_id = "deid_finetuned.Q4_K_M.gguf"
 hf_hub_download(
+    repo_id=repo_id,
+    filename=model_id,
     local_dir = "./models"
 )
+# hf_hub_download(
+#     repo_id="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF",
+#     filename="Meta-Llama-3-8B-Instruct.Q8_0.gguf",
+#     local_dir = "./models"
+# )
 # hf_hub_download(
 #     repo_id="bartowski/Meta-Llama-3-70B-Instruct-GGUF",
 #     filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
     return html_content
 def deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability):
+    prompt = "In the following text, perform the following actions: 1. Replace only the calendar dates with term [date]. Example: if input is 'Date of birth: 15/5/1959 calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)' output should be 'Date of birth: [date] calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)' 2. Replace location or address, such as '3970 Longview Drive, CV36HE' with term [address]. Replace complete GP address, such as 'Phanton Medical Centre, Birmingham, CV36HE' with term [address]. It is important that all addresses are completely replaced with [address]. 3. Replace any person name with term [name]. It is important that all person names are replaced with term [name]. Remove any gender terms 'male' or 'female' if exists. 4. Replace the nhs number and the case note number with term [ID]. Replace Hospital number with [ID]."
     output = llm.create_chat_completion(
         messages=[
+            {"from": "user", "value": prompt + ' Text: ' + pdftext},
         ],
         max_tokens=maxtokens,
         temperature=temperature
     )
     output = output['choices'][0]['message']['content']
     # Remove starting header string in output
     find_index = output.find(' '.join(pdftext.split()[:3]))
     if find_index != -1:
         output = output[find_index:].strip()
+    # #### Remove Dates ###
+    # prompt = "In the following text replace only the calendar dates with term [date]. Example: if input is 'Date of birth: 15/5/1959 calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)' output should be 'Date of birth: [date] calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)'"
+    # output = llm.create_chat_completion(
+    #     messages=[
+    #         {"role": "assistant", "content": prompt},
+    #         {
+    #             "role": "user",
+    #             "content": pdftext
+    #         }
+    #     ],
+    #     max_tokens=maxtokens,
+    #     temperature=temperature
+    # )
+    # output = output['choices'][0]['message']['content']
+    # # Remove starting header string in output
+    # find_index = output.find(' '.join(pdftext.split()[:3]))
+    # if find_index != -1:
+    #     output = output[find_index:].strip()
+    # # #### Remove Locations and Addresses ###
+    # prompt = "In the following text replace location or address, such as '3970 Longview Drive, CV36HE' with term [address]. Replace complete GP address, such as 'Phanton Medical Centre, Birmingham, CV36HE' with term [address]. It is important that all addresses are completely replaced with [address]."
+    # output = llm.create_chat_completion(
+    #     messages=[
+    #         {"role": "assistant", "content": prompt},
+    #         {
+    #             "role": "user",
+    #             "content": output
+    #         }
+    #     ],
+    #     max_tokens=maxtokens,
+    #     temperature=temperature
+    # )
+    # output = output['choices'][0]['message']['content']
+    # # Remove starting header string in output
+    # find_index = output.find(' '.join(pdftext.split()[:3]))
+    # if find_index != -1:
+    #     output = output[find_index:].strip()
+    # #### Remove Names ###
+    # prompt = "In the following text replace any person name with term [name]. It is important that all person names are replaced with term [name]. Remove any gender terms 'male' or 'female' if exists."
+    # output = llm.create_chat_completion(
+    #     messages=[
+    #         {"role": "assistant", "content": prompt},
+    #         {
+    #             "role": "user",
+    #             "content": output
+    #         }
+    #     ],
+    #     max_tokens=maxtokens,
+    #     temperature=temperature
+    # )
+    # output = output['choices'][0]['message']['content']
+    # # Remove starting header string in output
+    # find_index = output.find(' '.join(pdftext.split()[:3]))
+    # if find_index != -1:
+    #     output = output[find_index:].strip()
+    # ### Remove Registration Numbers ###
+    # prompt = "In the following text replace the nhs number and the case note number with term [ID]. Replace Hospital number with [ID]."
+    # output = llm.create_chat_completion(
+    #     messages=[
+    #         {"role": "assistant", "content": prompt},
+    #         {
+    #             "role": "user",
+    #             "content": output
+    #         }
+    #     ],
+    #     max_tokens=maxtokens,
+    #     temperature=temperature
+    # )
+    # output = output['choices'][0]['message']['content']
+    # # Remove starting header string in output
+    # find_index = output.find(' '.join(pdftext.split()[:3]))
+    # if find_index != -1:
+    #     output = output[find_index:].strip()
     return output
 def pdf_to_text(files, maxtokens=2048, temperature=0, top_probability=0.95):
     files=[files]
     llm = Llama(
+        model_path="models/" + model_id,
         flash_attn=True,
         n_gpu_layers=81,
         n_batch=1024,