Update modeling_GOT.py

modeling_GOT.py  +181 -1
@@ -541,7 +541,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             offset=0,
             sep_style=SeparatorStyle.MPT,
             sep="<|im_end|>",
-            )
+            )
 
         conv = conv_mpt.copy()
         conv.append_message(conv.roles[0], qs)
@@ -657,3 +657,183 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         # with open(html_path_2, 'w') as web_f_new:
         #     web_f_new.write(new_web)
 
+    def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
+
+        def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+            best_ratio_diff = float('inf')
+            best_ratio = (1, 1)
+            area = width * height
+            for ratio in target_ratios:
+                target_aspect_ratio = ratio[0] / ratio[1]
+                ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+                if ratio_diff < best_ratio_diff:
+                    best_ratio_diff = ratio_diff
+                    best_ratio = ratio
+                elif ratio_diff == best_ratio_diff:
+                    if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                        best_ratio = ratio
+            # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
+            return best_ratio
+
+        orig_width, orig_height = image.size
+        aspect_ratio = orig_width / orig_height
+
+        # calculate the existing image aspect ratio
+        target_ratios = set(
+            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
+            i * j <= max_num and i * j >= min_num)
+        # print(target_ratios)
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+        # find the closest aspect ratio to the target
+        target_aspect_ratio = find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+
+        # print(target_aspect_ratio)
+        # calculate the target width and height
+        target_width = image_size * target_aspect_ratio[0]
+        target_height = image_size * target_aspect_ratio[1]
+        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+        # resize the image
+        resized_img = image.resize((target_width, target_height))
+        processed_images = []
+        for i in range(blocks):
+            box = (
+                (i % (target_width // image_size)) * image_size,
+                (i // (target_width // image_size)) * image_size,
+                ((i % (target_width // image_size)) + 1) * image_size,
+                ((i // (target_width // image_size)) + 1) * image_size
+            )
+            # split the image
+            split_img = resized_img.crop(box)
+            processed_images.append(split_img)
+        assert len(processed_images) == blocks
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = image.resize((image_size, image_size))
+            processed_images.append(thumbnail_img)
+        return processed_images
+
+
+    def chat_crop(self, tokenizer, image_file, ocr_type, ocr_box='', ocr_color='', render=False, multi_page=False):
+        # Model
+        self.disable_torch_init()
+
+
+        image_processor_high = GOTImageEvalProcessor(image_size=1024)
+
+        use_im_start_end = True
+
+
+        image_token_len = 256
+
+        image_list = []
+
+        if multi_page:
+            qs = 'OCR with format across multi pages: '
+            # only for png files
+            import glob
+            from natsort import natsorted
+            patches = glob.glob(image_file + '/*png')
+            patches = natsorted(patches)
+            sub_images = []
+            for sub_image in patches:
+                sub_images.append(self.load_image(sub_image))
+
+            ll = len(patches)
+
+        else:
+            qs = 'OCR with format upon the patch reference: '
+            img = self.load_image(image_file)
+            sub_images = self.dynamic_preprocess(img)
+            ll = len(sub_images)
+
+        for image in sub_images:
+            image_tensor_1 = image_processor_high(image)
+            image_list.append(image_tensor_1)
+
+
+        image_list = torch.stack(image_list)
+
+        print('====new images batch size======: ', image_list.shape)
+
+
+        if use_im_start_end:
+            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len*ll + DEFAULT_IM_END_TOKEN + '\n' + qs
+        else:
+            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+
+
+        conv_mpt = Conversation(
+            system="""<|im_start|>system
+        You should follow the instructions carefully and explain your answers in detail.""",
+            # system = None,
+            roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+            version="mpt",
+            messages=(),
+            offset=0,
+            sep_style=SeparatorStyle.MPT,
+            sep="<|im_end|>",
+            )
+
+        conv = conv_mpt.copy()
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+
+        inputs = tokenizer([prompt])
+
+        input_ids = torch.as_tensor(inputs.input_ids).cuda()
+
+        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+        keywords = [stop_str]
+        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+
+        with torch.autocast("cuda", dtype=torch.bfloat16):
+            output_ids = self.generate(
+                input_ids,
+                images=[(image_list.half().cuda(), image_list.half().cuda())],
+                do_sample=False,
+                num_beams = 1,
+                # no_repeat_ngram_size = 20,
+                streamer=streamer,
+                max_new_tokens=4096,
+                stopping_criteria=[stopping_criteria]
+                )
+
+        # if render:
+        #     print('==============rendering===============')
+        #     outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
+
+        #     if outputs.endswith(stop_str):
+        #         outputs = outputs[:-len(stop_str)]
+        #     outputs = outputs.strip()
+
+        #     html_path = "./render_tools/" + "/content-mmd-to-html.html"
+        #     html_path_2 = "./results/demo.html"
+        #     right_num = outputs.count('\\right')
+        #     left_num = outputs.count('\left')
+
+        #     if right_num != left_num:
+        #         outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
+
+
+        #     outputs = outputs.replace('"', '``').replace('$', '')
+
+        #     outputs_list = outputs.split('\n')
+        #     gt= ''
+        #     for out in outputs_list:
+        #         gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
+
+        #     gt = gt[:-2]
+
+        #     with open(html_path, 'r') as web_f:
+        #         lines = web_f.read()
+        #         lines = lines.split("const text =")
+        #         new_web = lines[0] + 'const text =' + gt + lines[1]
+
+        #     with open(html_path_2, 'w') as web_f_new:
+        #         web_f_new.write(new_web)