From d87ec0dc6f1499c74f554c28da3cb416c0ef57e2 Mon Sep 17 00:00:00 2001
From: xiaotinghe
Date: Wed, 10 Apr 2024 09:15:18 +0000
Subject: [PATCH] updates pdf to markdown

---
 source/model/etl/code/Dockerfile   |  2 +-
 source/model/etl/code/DockerfileCN |  2 +-
 source/model/etl/code/layout.py    | 11 +++-
 source/model/etl/code/main.py      |  5 +-
 source/model/etl/code/ocr.py       | 86 +++++++++++++++++-------------
 source/model/etl/code/table.py     |  5 +-
 6 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/source/model/etl/code/Dockerfile b/source/model/etl/code/Dockerfile
index 26599aa9..f9011320 100644
--- a/source/model/etl/code/Dockerfile
+++ b/source/model/etl/code/Dockerfile
@@ -11,7 +11,7 @@ ARG FUNCTION_DIR="/opt/ml/code/"
 ARG MODEL_DIR="/opt/ml/model/"
 ENV MODEL_PATH=${MODEL_DIR}
 
-ARG LAYOUT_MODEL_URL="https://xiaotih.seal.ac.cn"
+ARG LAYOUT_MODEL_URL="https://aws-gcr-solutions-assets.s3.cn-northwest-1.amazonaws.com.cn/ai-solution-kit/layout-analysis/1.4.0"
 
 RUN mkdir -p ${MODEL_DIR} && wget -c $LAYOUT_MODEL_URL/layout_weight.zip -O ${MODEL_DIR}/layout_weight.zip
 RUN unzip ${MODEL_DIR}/layout_weight.zip -d ${MODEL_DIR} && rm -rf ${MODEL_DIR}/layout_weight.zip
diff --git a/source/model/etl/code/DockerfileCN b/source/model/etl/code/DockerfileCN
index 682db694..307c6b65 100644
--- a/source/model/etl/code/DockerfileCN
+++ b/source/model/etl/code/DockerfileCN
@@ -11,7 +11,7 @@ ARG FUNCTION_DIR="/opt/ml/code/"
 ARG MODEL_DIR="/opt/ml/model/"
 ENV MODEL_PATH=${MODEL_DIR}
 
-ARG LAYOUT_MODEL_URL="https://xiaotih.seal.ac.cn"
+ARG LAYOUT_MODEL_URL="https://aws-gcr-solutions-assets.s3.cn-northwest-1.amazonaws.com.cn/ai-solution-kit/layout-analysis/1.4.0"
 
 RUN mkdir -p ${MODEL_DIR} && wget -c $LAYOUT_MODEL_URL/layout_weight.zip -O ${MODEL_DIR}/layout_weight.zip
 RUN unzip ${MODEL_DIR}/layout_weight.zip -d ${MODEL_DIR} && rm -rf ${MODEL_DIR}/layout_weight.zip
diff --git a/source/model/etl/code/layout.py b/source/model/etl/code/layout.py
index 8f8ef9b7..a223ba6b 100644
--- a/source/model/etl/code/layout.py
+++ b/source/model/etl/code/layout.py
@@ -19,11 +19,18 @@
 from utils import preprocess, multiclass_nms, postprocess
 import onnxruntime
+import GPUtil
+if len(GPUtil.getGPUs()):
+    provider = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"}), "CPUExecutionProvider"]
+    model = 'layout.onnx'
+else:
+    provider = ["CPUExecutionProvider"]
+    model = 'layout_s.onnx'
 
 class LayoutPredictor(object):
     def __init__(self):
-        self.ort_session = onnxruntime.InferenceSession(os.path.join(os.environ['MODEL_PATH'], 'layout.onnx'), providers=["CUDAExecutionProvider"])
-        _ = self.ort_session.run(['output'], {'images': np.zeros((1,3,640,640), dtype='float32')})[0]
+        self.ort_session = onnxruntime.InferenceSession(os.path.join(os.environ['MODEL_PATH'], model), providers=provider)
+        #_ = self.ort_session.run(['output'], {'images': np.zeros((1,3,640,640), dtype='float32')})[0]
         self.categorys = ['text', 'title', 'figure', 'table']
 
     def __call__(self, img):
         ori_im = img.copy()
diff --git a/source/model/etl/code/main.py b/source/model/etl/code/main.py
index 9022f3e6..b298aa58 100644
--- a/source/model/etl/code/main.py
+++ b/source/model/etl/code/main.py
@@ -18,6 +18,8 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+os.environ['MODEL_PATH'] = '/home/ubuntu/notebooks/Solutions/temp/'
+
 class StructureSystem(object):
     def __init__(self):
         self.mode = 'structure'
@@ -310,7 +312,6 @@ def process_pdf_pipeline(request_body):
         "object_key": "test_data/test_glue_lib/cn_pdf/2023.ccl-2.6.pdf",
         "destination_bucket": "llm-bot-document-results-icyxu",
         "mode": "ppstructure",
-        "lang": "zh",
+        "lang": "ch",
     }
-    print(process_pdf_pipeline(body))
 
diff --git a/source/model/etl/code/ocr.py b/source/model/etl/code/ocr.py
index aa78c1c7..a0587a7f 100644
--- a/source/model/etl/code/ocr.py
+++ b/source/model/etl/code/ocr.py
@@ -9,14 +9,13 @@
 import cv2
 from imaug import create_operators, transform
 from postprocess import build_post_process
-
-cuda_available = True
-
-sess_options = onnxruntime.SessionOptions()
-
-sess_options.intra_op_num_threads = 8
-sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
-sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+import GPUtil
+if len(GPUtil.getGPUs()):
+    provider = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"}), "CPUExecutionProvider"]
+    rec_batch_num = 6
+else:
+    provider = ["CPUExecutionProvider"]
+    rec_batch_num = 1
 
 class TextClassifier():
     def __init__(self):
@@ -32,7 +31,7 @@ def __init__(self):
         }
         self.postprocess_op = build_post_process(postprocess_params)
 
-        self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=[("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"})])
+        self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=provider)
 
     def resize_norm_img(self, img):
         imgC, imgH, imgW = self.cls_image_shape
@@ -95,21 +94,28 @@ def __call__(self, img_list):
         return img_list, cls_res
 
 class TextDetector():
-    def __init__(self):
-        modelName = 'det_cn.onnx'
+    def __init__(self, lang):
+
+        if lang=='ch':
+            modelName = 'det_cn.onnx'
+        elif lang=='en':
+            modelName = 'det_en.onnx'
+        else:
+            modelName = 'det_cn.onnx'
+
         self.weights_path = os.environ['MODEL_PATH'] + modelName
         self.det_algorithm = 'DB'
         self.use_zero_copy_run = False
-        pre_process_list = [{'DetResizeForTest': {'limit_side_len': 960, 'limit_type': 'max'}},
+        pre_process_list = [{'DetResizeForTest': {'limit_side_len': 1280, 'limit_type': 'max'}},
                             {'NormalizeImage': {'std': [0.229, 0.224, 0.225], 'mean': [0.485, 0.456, 0.406], 'scale': '1./255.', 'order': 'hwc'}},
                             {'ToCHWImage': None},
                             {'KeepKeys': {'keep_keys': ['image', 'shape']}}]
         postprocess_params = {'name': 'DBPostProcess', 'thresh': 0.1, 'box_thresh': 0.1, 'max_candidates': 1000, 'unclip_ratio': 1.5, 'use_dilation': False, 'score_mode': 'fast', 'box_type': 'quad'}
         self.preprocess_op = create_operators(pre_process_list)
         self.postprocess_op = build_post_process(postprocess_params)
-        self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=[("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"})])
+        self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=provider)
         _ = self.ort_session.run(None, {"x": np.zeros([1, 3, 64, 64], dtype='float32')})
         # load_pytorch_weights
@@ -190,37 +196,40 @@ class TextRecognizer():
     def __init__(self, lang='ch'):
         if lang=='ch':
             modelName = 'rec_ch.onnx'
-        else:
-            modelName = 'rec_en.onnx'
-        self.weights_path = os.environ['MODEL_PATH'] + modelName
-
-        self.limited_max_width = 1280
-        self.limited_min_width = 16
-
-        self.rec_image_shape = [3, 48, 480]
-        self.character_type = 'ch'
-        self.rec_batch_num = 6
-        self.rec_algorithm = 'CRNN'
-        self.use_zero_copy_run = False
-        if lang=='ch':
             postprocess_params = {
                 'name': 'CTCLabelDecode',
                 "character_type": 'ch',
                 "character_dict_path": os.environ['MODEL_PATH'] + 'ppocr_keys_v1.txt',
                 "use_space_char": True
             }
-        else:
+        elif lang=='en':
+            modelName = 'rec_en.onnx'
             postprocess_params = {
                 'name': 'CTCLabelDecode',
                 "character_dict_path": os.environ['MODEL_PATH'] + 'en_dict.txt',
                 "use_space_char": True
             }
+        else:
+            modelName = 'rec_multi_large.onnx'
+            postprocess_params = {
+                'name': 'CTCLabelDecode',
+                "character_type": 'ch',
+                "character_dict_path": os.environ['MODEL_PATH'] + 'keys_en_chs_cht_vi_ja_ko.txt',
+                "use_space_char": True
+            }
+        self.weights_path = os.environ['MODEL_PATH'] + modelName
+
+        self.limited_max_width = 1280
+        self.limited_min_width = 16
+
+        self.rec_image_shape = [3, 48, 480]
+        self.rec_batch_num = rec_batch_num
+        self.rec_algorithm = 'CRNN'
+        self.use_zero_copy_run = False
+
         self.postprocess_op = build_post_process(postprocess_params)
-        self.ort_session = onnxruntime.InferenceSession(self.weights_path, sess_options=sess_options, providers=[("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"}),"CPUExecutionProvider"])
-        #self.ort_session = onnxruntime.InferenceSession(self.weights_path, sess_options=sess_options)
-        # for s in range(1,40):
-        #     _ = self.ort_session.run(None, {"x": np.zeros([1, 3, 48, s*48], dtype='float32')})
+        self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=provider)
 
     def resize_norm_img(self, img, max_wh_ratio):
         imgC, imgH, imgW = self.rec_image_shape
@@ -308,15 +317,20 @@ def sorted_boxes(dt_boxes):
     return _boxes
 
 class TextSystem:
     def __init__(self):
-        self.text_detector = TextDetector()
-
+        #self.text_detector = TextDetector()
+        self.text_detector = {
+            'ch': TextDetector('ch'),
+            'en': TextDetector('en'),
+            'multi': TextDetector('multi'),
+        }
         self.text_recognizer = {
             'ch': TextRecognizer('ch'),
             'en': TextRecognizer('en'),
+            'multi': TextRecognizer('multi'),
         }
         self.drop_score = 0.4
-        self.text_classifier = TextClassifier()
+        #self.text_classifier = TextClassifier()
 
     def get_rotate_crop_image(self, img, points):
         """
@@ -364,7 +378,7 @@ def get_rotate_crop_image(self, img, points):
 
     def __call__(self, img, lang='ch'):
         ori_im = img.copy()
-        dt_boxes = self.text_detector(img)
+        dt_boxes = self.text_detector[lang](img)
         if dt_boxes is None:
             return None, None
         img_crop_list = []
@@ -375,7 +389,7 @@ def __call__(self, img, lang='ch'):
             tmp_box = copy.deepcopy(dt_boxes[bno])
             img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
             img_crop_list.append(img_crop)
-        img_crop_list, angle_list = self.text_classifier(img_crop_list)
+        #img_crop_list, angle_list = self.text_classifier(img_crop_list)
 
         rec_res = self.text_recognizer[lang](img_crop_list)
         filter_boxes, filter_rec_res = [], []
diff --git a/source/model/etl/code/table.py b/source/model/etl/code/table.py
index aa687f7e..06f49546 100644
--- a/source/model/etl/code/table.py
+++ b/source/model/etl/code/table.py
@@ -46,9 +46,8 @@ def __init__(self):
         self.preprocess_op = create_operators(pre_process_list)
         self.postprocess_op = build_post_process(postprocess_params)
 
-        sess = ort.InferenceSession(os.environ['MODEL_PATH'] + 'table_sim.onnx', providers=["CPUExecutionProvider"]) #, sess_options=sess_options, providers=[("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})]
-        #sess = ort.InferenceSession('/home/ubuntu/notebooks/版面分析/ppstructure改进/PaddleOCR/inference/ppstructure/model_fp16.onnx')
-        #_ = sess.run(None, {'x': np.zeros((1, 3, 488, 488), dtype='float32')})
+        sess = ort.InferenceSession(os.environ['MODEL_PATH'] + 'table_sim.onnx', providers=['CPUExecutionProvider']) #, sess_options=sess_options, providers=[("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})]
+        _ = sess.run(None, {'x': np.zeros((1, 3, 488, 488), dtype='float32')})
         self.predictor, self.input_tensor, self.output_tensors, self.config = sess, sess.get_inputs()[0], None, None
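
The change repeated across layout.py and ocr.py is one pattern: probe for a GPU with GPUtil, build the ONNX Runtime provider list from the result, and route each language key to its own detector/recognizer weights. The following standalone sketch illustrates that pattern under the patch's assumptions (GPUtil and onnxruntime installed, MODEL_PATH pointing at a directory containing the ONNX files named in the diff); the pick_providers and load_session helpers and the __main__ wiring are illustrative only, not code from this repository.

# Sketch of the provider-selection and per-language routing pattern in this patch.
# Assumes MODEL_PATH is set and holds the ONNX weights named in the diff.
import os

import GPUtil
import onnxruntime


def pick_providers():
    # Prefer CUDA with the heuristic cuDNN conv search when a GPU is visible,
    # otherwise fall back to CPU only.
    if GPUtil.getGPUs():
        return [
            ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"}),
            "CPUExecutionProvider",
        ]
    return ["CPUExecutionProvider"]


def load_session(model_name, providers):
    # Illustrative helper: one InferenceSession per model file under MODEL_PATH.
    return onnxruntime.InferenceSession(
        os.path.join(os.environ["MODEL_PATH"], model_name), providers=providers
    )


if __name__ == "__main__":
    providers = pick_providers()
    on_gpu = len(providers) > 1
    rec_batch_num = 6 if on_gpu else 1          # mirrors rec_batch_num in ocr.py
    layout = load_session("layout.onnx" if on_gpu else "layout_s.onnx", providers)
    # Per-language routing as in TextSystem: 'multi' reuses the Chinese detector
    # weights but switches to the large multilingual recognizer.
    det = {lang: load_session(name, providers)
           for lang, name in {"ch": "det_cn.onnx", "en": "det_en.onnx", "multi": "det_cn.onnx"}.items()}
    rec = {lang: load_session(name, providers)
           for lang, name in {"ch": "rec_ch.onnx", "en": "rec_en.onnx", "multi": "rec_multi_large.onnx"}.items()}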