nielsr (HF staff) committed
Commit: f41defe
1 Parent(s): f60dcfe

Update README.md

Files changed (1):
  1. README.md +9 -15

README.md CHANGED
@@ -64,14 +64,12 @@ text_prompt = "Generate a coco-style caption.\n"
 url = "https://huggingface.co/adept/fuyu-8b/resolve/main/bus.png"
 image = Image.open(requests.get(url, stream=True).raw)
 
-inputs = processor(text=text_prompt, images=image, return_tensors="pt")
-for k, v in inputs.items():
-    inputs[k] = v.to("cuda:0")
+inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0")
 
 # autoregressively generate text
 generation_output = model.generate(**inputs, max_new_tokens=7)
 generation_text = processor.batch_decode(generation_output[:, -7:], skip_special_tokens=True)
-assert generation_text == ['A bus parked on the side of a road.']
+assert generation_text == ['A blue bus parked on the side of a road.']
 ```
 
 N.B.: The token `|SPEAKER|` is a placeholder token for image patch embeddings, so it will show up in the model context (e.g., in the portion of `generation_output` representing the model context).
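For reference, here is a minimal end-to-end sketch of the captioning snippet as it reads after this change. The `FuyuProcessor`/`FuyuForCausalLM` loading lines, the `bfloat16` dtype, and `device_map="cuda:0"` are assumptions added for completeness (they are not part of this diff) and presume a single CUDA GPU with `accelerate` installed:

```python
import requests
import torch
from PIL import Image
from transformers import FuyuForCausalLM, FuyuProcessor

# Assumed loading code; not part of the diff above.
model_id = "adept/fuyu-8b"
processor = FuyuProcessor.from_pretrained(model_id)
model = FuyuForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")

text_prompt = "Generate a coco-style caption.\n"
url = "https://huggingface.co/adept/fuyu-8b/resolve/main/bus.png"
image = Image.open(requests.get(url, stream=True).raw)

# A single .to() on the processor output replaces the removed per-key loop.
inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0")

# Autoregressively generate text, then decode only the newly generated tokens.
generation_output = model.generate(**inputs, max_new_tokens=7)
generation_text = processor.batch_decode(generation_output[:, -7:], skip_special_tokens=True)
print(generation_text)
```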
@@ -81,25 +79,21 @@ N.B.: The token `|SPEAKER|` is a placeholder token for image patch embeddings, s
 Fuyu can also perform some question answering on natural images and charts/diagrams (though fine-tuning may be required for good performance):
 ```python
 text_prompt = "What color is the bus?\n"
-image_path = "bus.png" # https://huggingface.co/adept-hf-collab/fuyu-8b/blob/main/bus.png
-image_pil = Image.open(image_path)
+url = "https://huggingface.co/adept/fuyu-8b/resolve/main/bus.png"
+image = Image.open(requests.get(url, stream=True).raw)
 
-model_inputs = processor(text=text_prompt, images=[image_pil], device="cuda:0")
-for k, v in model_inputs.items():
-    model_inputs[k] = v.to("cuda:0")
+inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0")
 
-generation_output = model.generate(**model_inputs, max_new_tokens=6)
+generation_output = model.generate(**inputs, max_new_tokens=6)
 generation_text = processor.batch_decode(generation_output[:, -6:], skip_special_tokens=True)
 assert generation_text == ["The bus is blue.\n"]
 
 
 text_prompt = "What is the highest life expectancy at birth of male?\n"
-image_path = "chart.png" # https://huggingface.co/adept-hf-collab/fuyu-8b/blob/main/chart.png
-image_pil = Image.open(image_path)
+url = "https://huggingface.co/adept/fuyu-8b/resolve/main/chart.png"
+image = Image.open(requests.get(url, stream=True).raw)
 
-model_inputs = processor(text=text_prompt, images=[image_pil], device="cuda:0")
-for k, v in model_inputs.items():
-    model_inputs[k] = v.to("cuda:0")
+model_inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0")
 
 generation_output = model.generate(**model_inputs, max_new_tokens=16)
 generation_text = processor.batch_decode(generation_output[:, -16:], skip_special_tokens=True)
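The two question-answering snippets above follow the same pattern as the caption example: build the inputs on the GPU with a single `.to()` call, generate `N` new tokens, and decode only those `N` tokens so the prompt and `|SPEAKER|` image-patch context are dropped. A rough sketch of that shared pattern as a hypothetical helper (the function name and signature are illustrative, not from the README):

```python
def ask_fuyu(model, processor, image, prompt, max_new_tokens):
    # Move all processor outputs to the GPU in one call, as in the updated snippets.
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda:0")
    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Keep only the newly generated tokens; everything before them is the
    # prompt/image-placeholder context.
    return processor.batch_decode(output[:, -max_new_tokens:], skip_special_tokens=True)

# e.g. ask_fuyu(model, processor, image, "What color is the bus?\n", max_new_tokens=6)
```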
 