jwkirchenbauer committed
Commit ea229c1 • 1 Parent(s): 3682749

reduce model selection to just llama2,

gradio update forced changes,
readme updates.

Files changed (2)
  1. app.py +5 -3
  2. demo_watermark.py +27 -30
app.py CHANGED
@@ -23,10 +23,12 @@ arg_dict = {
     # 'model_name_or_path': 'facebook/opt-125m',
     # 'model_name_or_path': 'facebook/opt-1.3b',
     # 'model_name_or_path': 'facebook/opt-2.7b',
-    'model_name_or_path': 'facebook/opt-6.7b',
+    # 'model_name_or_path': 'facebook/opt-6.7b',
     # 'model_name_or_path': 'facebook/opt-13b',
-    'load_fp16' : True,
-    # 'load_fp16' : False,
+    'model_name_or_path': 'meta-llama/Llama-2-7b-hf',
+    # 'load_fp16' : True,
+    'load_fp16' : False,
+    'load_bf16' : True,
     'prompt_max_length': None,
     'max_new_tokens': 200,
     'generation_seed': 123,
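For reference, a minimal sketch of what these settings resolve to at load time, assuming standard `transformers` and `torch` installs. Note that `meta-llama/Llama-2-7b-hf` is a gated checkpoint, so approved Hub access and a configured login token are required:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"  # gated repo: requires approved access on the Hub

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # corresponds to 'load_bf16': True
    device_map="auto",           # let accelerate place the weights
)
```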
demo_watermark.py CHANGED
@@ -186,19 +186,27 @@ def parse_args():
         default=False,
         help="Whether to run model in float16 precision.",
     )
+    parser.add_argument(
+        "--load_bf16",
+        type=str2bool,
+        default=False,
+        help="Whether to run model in bfloat16 precision.",
+    )
     args = parser.parse_args()
     return args

 def load_model(args):
     """Load and return the model and tokenizer"""

-    args.is_seq2seq_model = any([(model_type in args.model_name_or_path) for model_type in ["t5","T0"]])
-    args.is_decoder_only_model = any([(model_type in args.model_name_or_path) for model_type in ["gpt","opt","bloom"]])
+    args.is_seq2seq_model = any([(model_type in args.model_name_or_path.lower()) for model_type in ["t5","t0"]])
+    args.is_decoder_only_model = any([(model_type in args.model_name_or_path.lower()) for model_type in ["gpt","opt","bloom","llama"]])
     if args.is_seq2seq_model:
         model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_or_path)
     elif args.is_decoder_only_model:
         if args.load_fp16:
             model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.float16, device_map='auto')
+        elif args.load_bf16:
+            model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.bfloat16, device_map='auto')
         else:
             model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)
     else:
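`str2bool` is defined elsewhere in `demo_watermark.py` and is not shown in this diff; a typical argparse helper of that name looks roughly like this sketch:

```python
import argparse

def str2bool(v):
    """Interpret common true/false spellings passed as CLI strings."""
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if v.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
```

Note also the new `.lower()` in the family checks: the match is a plain substring test, so `meta-llama/Llama-2-7b-hf` only matches `llama` once the path is lowercased.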
@@ -206,7 +214,7 @@ def load_model(args):

     if args.use_gpu:
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        if args.load_fp16:
+        if args.load_fp16 or args.load_bf16:
             pass
         else:
             model = model.to(device)
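The broadened condition matters because the fp16/bf16 paths above were loaded with `device_map='auto'`, which already dispatches the weights via accelerate; an explicit `model.to(device)` is only needed on the full-precision path. Reusing `model` from the loading sketch earlier, the recorded placement can be inspected (the `hf_device_map` attribute is set by `transformers` when a device map is used):

```python
# Inspect where accelerate placed the weights; None means no device map was used.
print(getattr(model, "hf_device_map", None))  # e.g. {'': 0} when everything fits on GPU 0
```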
@@ -412,8 +420,13 @@ def detect(input_text, args, tokenizer, device=None, return_green_token_mask=Tru
     if error:
         output = [["Error","string too short to compute metrics"]]
         output += [["",""] for _ in range(6)]
-
+
+
     html_output = "[No highlight markup generated]"
+
+    if green_token_mask is None:
+        html_output = "[Visualizing masks with ignore_repeated_bigrams enabled is not supported, toggle off to see the mask for this text. The mask is the same in both cases - only counting/stats are affected.]"
+
     if green_token_mask is not None:
         # hack bc we need a fast tokenizer with charspan support
         if "opt" in args.model_name_or_path:
@@ -453,8 +466,6 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
             gr.Markdown(
                 """
                 ## 💧 [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) 🔍
-
-                Demo made possible by the HuggingFace 🤗 [text-generation-inference](https://github.com/huggingface/text-generation-inference) serving framework.
                 """
             )
         with gr.Column(scale=1):
@@ -464,7 +475,8 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
                 """
             )
             # if model_name_or_path at startup not one of the API models then add to dropdown
-            all_models = sorted(list(set(list(API_MODEL_MAP.keys())+[args.model_name_or_path])))
+            # all_models = sorted(list(set(list(API_MODEL_MAP.keys())+[args.model_name_or_path])))
+            all_models = [args.model_name_or_path]
             model_selector = gr.Dropdown(
                 all_models,
                 value=args.model_name_or_path,
@@ -488,29 +500,12 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
                 was likely to have been generated by a model that uses the watermark.

                 This space showcases a watermarking approach that can be applied to _any_ generative language model.
-                For demonstration purposes, the space demos a selection of multi-billion parameter models (see the following note for caveats).
+                For demonstration purposes, the space demos a relatively small open-source language model.
+                Such a model is less powerful than proprietary commercial tools like ChatGPT, Claude, or Gemini.
+                Generally, prompts that entail a short, low-entropy response, such as a few-word answer to a factual trivia question,
+                will not exhibit a strong watermark presence, while longer watermarked outputs will produce higher detection statistics.
                 """
             )
-            with gr.Accordion("A note on the available models:",open=False):
-                gr.Markdown(
-                    """
-                    This demo uses open-source language models. Today, these models are less powerful than proprietary commercial tools like ChatGPT, Claude, Bard, or Bing/Sydney.
-
-                    Smaller models like OPT-6.7b are designed to "complete" your prompt, and are not fine-tuned to follow instructions.
-                    For best results, prompt that model with a few sentences that form the beginning of a paragraph, and then allow it to "continue" your paragraph.
-                    Some examples include the opening paragraph of a wikipedia article, or the first few sentences of a story.
-                    Longer prompts that end mid-sentence will result in more fluent generations.
-
-                    The larger models available in this demo are fine-tuned to follow instructions but have different strengths and will showcase different
-                    types of watermark behavior. [BLOOMZ](https://huggingface.co/bigscience/bloomz) is an instruction tuned variant of [BLOOM (175B)](https://huggingface.co/bigscience/bloom) capable of following instructions in dozens of languages zero-shot
-                    and can generate long and coherent paragraphs and stories given the right prompt.
-                    The FLAN models [FLAN-t5-xxl (11B)](https://huggingface.co/google/flan-t5-xxl) and [FLAN-UL2 (20B)](https://huggingface.co/google/flan-ul2) are fine-tuned on a variety of in-context few-shot learning NLP tasks,
-                    such as reasoning, and question answering.
-
-                    Generally, short, low entropy scenarios where the model has very few choices in terms of correct/suitable responses to the prompt
-                    will not exhibit as strong of a watermark presence, while longer watermarked outputs will produce higher detection statistics.
-                    """
-                )
             gr.Markdown(
                 """
                 **[Generate & Detect]**: The first tab shows that the watermark can be embedded with
@@ -526,7 +521,8 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
                 You can also verify here that the detection has, by design, a low false-positive rate;
                 This means that human-generated text that you copy into this detector will not be marked as machine-generated.

-                You can find more details on how this watermark functions in our [ArXiv preprint](https://arxiv.org/abs/2301.10226).
+                You can find more details about how this watermark functions in our paper ["A Watermark for Large Language Models"](https://arxiv.org/abs/2301.10226), presented at ICML 2023.
+                Additionally, read about our study on the reliability of this watermarking style in ["On the Reliability of Watermarks for Large Language Models"](https://arxiv.org/abs/2306.04634), presented at ICLR 2024.
                 """
             )

@@ -844,7 +840,8 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
         select_green_tokens.change(fn=detect_partial, inputs=[output_with_watermark,session_args,session_tokenizer], outputs=[with_watermark_detection_result,session_args,session_tokenizer,html_with_watermark])
         select_green_tokens.change(fn=detect_partial, inputs=[detection_input,session_args,session_tokenizer], outputs=[detection_result,session_args,session_tokenizer,html_detection_input])

-    demo.queue(concurrency_count=3)
+    # demo.queue(concurrency_count=3)
+    demo.queue()

     if args.demo_public:
         demo.launch(share=True) # exposes app to the internet via randomly generated link
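This is one of the "gradio update forced changes" from the commit message: newer Gradio releases removed the `concurrency_count` argument from `queue()`. If a global cap is still wanted, recent Gradio 4.x exposes it differently; treat this as a version-dependent sketch:

```python
# Gradio 4.x replaced queue(concurrency_count=...) with a global default
# plus per-event concurrency_limit settings on listeners.
demo.queue(default_concurrency_limit=3)  # global default for event handlers
```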
 