From d87ec0dc6f1499c74f554c28da3cb416c0ef57e2 Mon Sep 17 00:00:00 2001
From: xiaotinghe
Date: Wed, 10 Apr 2024 09:15:18 +0000
Subject: [PATCH] updates pdf to markdown

---
 source/model/etl/code/Dockerfile   |  2 +-
 source/model/etl/code/DockerfileCN |  2 +-
 source/model/etl/code/layout.py    | 11 +++-
 source/model/etl/code/main.py      |  5 +-
 source/model/etl/code/ocr.py       | 86 +++++++++++++++++-------------
 source/model/etl/code/table.py     |  5 +-
 6 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/source/model/etl/code/Dockerfile b/source/model/etl/code/Dockerfile
index 26599aa9..f9011320 100644
--- a/source/model/etl/code/Dockerfile
+++ b/source/model/etl/code/Dockerfile
@@ -11,7 +11,7 @@ ARG FUNCTION_DIR="/opt/ml/code/"
 ARG MODEL_DIR="/opt/ml/model/"
 ENV MODEL_PATH=${MODEL_DIR}
 
-ARG LAYOUT_MODEL_URL="https://xiaotih.seal.ac.cn"
+ARG LAYOUT_MODEL_URL="https://aws-gcr-solutions-assets.s3.cn-northwest-1.amazonaws.com.cn/ai-solution-kit/layout-analysis/1.4.0"
 
 RUN mkdir -p ${MODEL_DIR} && wget -c $LAYOUT_MODEL_URL/layout_weight.zip -O ${MODEL_DIR}/layout_weight.zip
 RUN unzip ${MODEL_DIR}/layout_weight.zip -d ${MODEL_DIR} && rm -rf ${MODEL_DIR}/layout_weight.zip
diff --git a/source/model/etl/code/DockerfileCN b/source/model/etl/code/DockerfileCN
index 682db694..307c6b65 100644
--- a/source/model/etl/code/DockerfileCN
+++ b/source/model/etl/code/DockerfileCN
@@ -11,7 +11,7 @@ ARG FUNCTION_DIR="/opt/ml/code/"
 ARG MODEL_DIR="/opt/ml/model/"
 ENV MODEL_PATH=${MODEL_DIR}
 
-ARG LAYOUT_MODEL_URL="https://xiaotih.seal.ac.cn"
+ARG LAYOUT_MODEL_URL="https://aws-gcr-solutions-assets.s3.cn-northwest-1.amazonaws.com.cn/ai-solution-kit/layout-analysis/1.4.0"
 
 RUN mkdir -p ${MODEL_DIR} && wget -c $LAYOUT_MODEL_URL/layout_weight.zip -O ${MODEL_DIR}/layout_weight.zip
 RUN unzip ${MODEL_DIR}/layout_weight.zip -d ${MODEL_DIR} && rm -rf ${MODEL_DIR}/layout_weight.zip
diff --git a/source/model/etl/code/layout.py b/source/model/etl/code/layout.py
index 8f8ef9b7..a223ba6b 100644
--- a/source/model/etl/code/layout.py
+++ b/source/model/etl/code/layout.py
@@ -19,11 +19,18 @@
 from utils import preprocess, multiclass_nms, postprocess
 import onnxruntime
+import GPUtil
+if len(GPUtil.getGPUs()):
+    provider = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"}), "CPUExecutionProvider"]
+    model = 'layout.onnx'
+else:
+    provider = ["CPUExecutionProvider"]
+    model = 'layout_s.onnx'
 
 class LayoutPredictor(object):
     def __init__(self):
-        self.ort_session = onnxruntime.InferenceSession(os.path.join(os.environ['MODEL_PATH'], 'layout.onnx'), providers=["CUDAExecutionProvider"])
-        _ = self.ort_session.run(['output'], {'images': np.zeros((1,3,640,640), dtype='float32')})[0]
+        self.ort_session = onnxruntime.InferenceSession(os.path.join(os.environ['MODEL_PATH'], model), providers=provider)
+        #_ = self.ort_session.run(['output'], {'images': np.zeros((1,3,640,640), dtype='float32')})[0]
         self.categorys = ['text', 'title', 'figure', 'table']
 
     def __call__(self, img):
         ori_im = img.copy()
diff --git a/source/model/etl/code/main.py b/source/model/etl/code/main.py
index 9022f3e6..b298aa58 100644
--- a/source/model/etl/code/main.py
+++ b/source/model/etl/code/main.py
@@ -18,6 +18,8 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+os.environ['MODEL_PATH'] = '/home/ubuntu/notebooks/Solutions/temp/'
+
 class StructureSystem(object):
     def __init__(self):
         self.mode = 'structure'
@@ -310,7 +312,6 @@ def process_pdf_pipeline(request_body):
         "object_key": "test_data/test_glue_lib/cn_pdf/2023.ccl-2.6.pdf",
         "destination_bucket": "llm-bot-document-results-icyxu",
         "mode": "ppstructure",
-        "lang": "zh",
+        "lang": "ch",
     }
-    print(process_pdf_pipeline(body))
 
diff --git a/source/model/etl/code/ocr.py b/source/model/etl/code/ocr.py
index aa78c1c7..a0587a7f 100644
--- a/source/model/etl/code/ocr.py
+++ b/source/model/etl/code/ocr.py
@@ -9,14 +9,13 @@
 import cv2
 from imaug import create_operators, transform
 from postprocess import build_post_process
-
-cuda_available = True
-
-sess_options = onnxruntime.SessionOptions()
-
-sess_options.intra_op_num_threads = 8
-sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
-sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+import GPUtil
+if len(GPUtil.getGPUs()):
+    provider = [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"}), "CPUExecutionProvider"]
+    rec_batch_num = 6
+else:
+    provider = ["CPUExecutionProvider"]
+    rec_batch_num = 1
 
 class TextClassifier():
     def __init__(self):
@@ -32,7 +31,7 @@ def __init__(self):
         }
         self.postprocess_op = build_post_process(postprocess_params)
 
-        self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=[("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"})])
+        self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=provider)
 
     def resize_norm_img(self, img):
         imgC, imgH, imgW = self.cls_image_shape
@@ -95,21 +94,28 @@ def __call__(self, img_list):
         return img_list, cls_res
 
 class TextDetector():
-    def __init__(self):
-        modelName = 'det_cn.onnx'
+    def __init__(self, lang):
+
+        if lang=='ch':
+            modelName = 'det_cn.onnx'
+        elif lang=='en':
+            modelName = 'det_en.onnx'
+        else:
+            modelName = 'det_cn.onnx'
+
         self.weights_path = os.environ['MODEL_PATH'] + modelName
         self.det_algorithm = 'DB'
         self.use_zero_copy_run = False
-        pre_process_list = [{'DetResizeForTest': {'limit_side_len': 960, 'limit_type': 'max'}},
+        pre_process_list = [{'DetResizeForTest': {'limit_side_len': 1280, 'limit_type': 'max'}},
                             {'NormalizeImage': {'std': [0.229, 0.224, 0.225], 'mean': [0.485, 0.456, 0.406], 'scale': '1./255.', 'order': 'hwc'}},
                             {'ToCHWImage': None},
                             {'KeepKeys': {'keep_keys': ['image', 'shape']}}]
         postprocess_params = {'name': 'DBPostProcess', 'thresh': 0.1, 'box_thresh': 0.1, 'max_candidates': 1000, 'unclip_ratio': 1.5, 'use_dilation': False, 'score_mode': 'fast', 'box_type': 'quad'}
         self.preprocess_op = create_operators(pre_process_list)
         self.postprocess_op = build_post_process(postprocess_params)
-        self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=[("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"})])
+        self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=provider)
         _ = self.ort_session.run(None, {"x": np.zeros([1, 3, 64, 64], dtype='float32')})
         # load_pytorch_weights
@@ -190,37 +196,40 @@ class TextRecognizer():
     def __init__(self, lang='ch'):
         if lang=='ch':
             modelName = 'rec_ch.onnx'
-        else:
-            modelName = 'rec_en.onnx'
-        self.weights_path = os.environ['MODEL_PATH'] + modelName
-
-        self.limited_max_width = 1280
-        self.limited_min_width = 16
-
-        self.rec_image_shape = [3, 48, 480]
-        self.character_type = 'ch'
-        self.rec_batch_num = 6
-        self.rec_algorithm = 'CRNN'
-        self.use_zero_copy_run = False
-        if lang=='ch':
             postprocess_params = {
                 'name': 'CTCLabelDecode',
                 "character_type": 'ch',
                 "character_dict_path": os.environ['MODEL_PATH'] + 'ppocr_keys_v1.txt',
                 "use_space_char": True
             }
-        else:
+        elif lang=='en':
+            modelName = 'rec_en.onnx'
             postprocess_params = {
                 'name': 'CTCLabelDecode',
                 "character_dict_path": os.environ['MODEL_PATH'] + 'en_dict.txt',
                 "use_space_char": True
             }
+        else:
+            modelName = 'rec_multi_large.onnx'
+            postprocess_params = {
+                'name': 'CTCLabelDecode',
+                "character_type": 'ch',
+                "character_dict_path": os.environ['MODEL_PATH'] + 'keys_en_chs_cht_vi_ja_ko.txt',
+                "use_space_char": True
+            }
+        self.weights_path = os.environ['MODEL_PATH'] + modelName
+
+        self.limited_max_width = 1280
+        self.limited_min_width = 16
+
+        self.rec_image_shape = [3, 48, 480]
+        self.rec_batch_num = rec_batch_num
+        self.rec_algorithm = 'CRNN'
+        self.use_zero_copy_run = False
+
         self.postprocess_op = build_post_process(postprocess_params)
-        self.ort_session = onnxruntime.InferenceSession(self.weights_path, sess_options=sess_options, providers=[("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"}),"CPUExecutionProvider"])
-        #self.ort_session = onnxruntime.InferenceSession(self.weights_path, sess_options=sess_options)
-        # for s in range(1,40):
-        #     _ = self.ort_session.run(None, {"x": np.zeros([1, 3, 48, s*48], dtype='float32')})
+        self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=provider)
 
     def resize_norm_img(self, img, max_wh_ratio):
         imgC, imgH, imgW = self.rec_image_shape
@@ -308,15 +317,20 @@ def sorted_boxes(dt_boxes):
     return _boxes
 
 class TextSystem:
     def __init__(self):
-        self.text_detector = TextDetector()
-
+        #self.text_detector = TextDetector()
+        self.text_detector = {
+            'ch': TextDetector('ch'),
+            'en': TextDetector('en'),
+            'multi': TextDetector('multi'),
+        }
         self.text_recognizer = {
             'ch': TextRecognizer('ch'),
             'en': TextRecognizer('en'),
+            'multi': TextRecognizer('multi'),
         }
         self.drop_score = 0.4
-        self.text_classifier = TextClassifier()
+        #self.text_classifier = TextClassifier()
 
     def get_rotate_crop_image(self, img, points):
         """
@@ -364,7 +378,7 @@ def get_rotate_crop_image(self, img, points):
 
     def __call__(self, img, lang='ch'):
         ori_im = img.copy()
-        dt_boxes = self.text_detector(img)
+        dt_boxes = self.text_detector[lang](img)
         if dt_boxes is None:
             return None, None
         img_crop_list = []
@@ -375,7 +389,7 @@ def __call__(self, img, lang='ch'):
             tmp_box = copy.deepcopy(dt_boxes[bno])
             img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
             img_crop_list.append(img_crop)
-        img_crop_list, angle_list = self.text_classifier(img_crop_list)
+        #img_crop_list, angle_list = self.text_classifier(img_crop_list)
 
         rec_res = self.text_recognizer[lang](img_crop_list)
         filter_boxes, filter_rec_res = [], []
diff --git a/source/model/etl/code/table.py b/source/model/etl/code/table.py
index aa687f7e..06f49546 100644
--- a/source/model/etl/code/table.py
+++ b/source/model/etl/code/table.py
@@ -46,9 +46,8 @@ def __init__(self):
         self.preprocess_op = create_operators(pre_process_list)
         self.postprocess_op = build_post_process(postprocess_params)
 
-        sess = ort.InferenceSession(os.environ['MODEL_PATH'] + 'table_sim.onnx', providers=["CPUExecutionProvider"]) #, sess_options=sess_options, providers=[("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})]
-        #sess = ort.InferenceSession('/home/ubuntu/notebooks/版面分析/ppstructure改进/PaddleOCR/inference/ppstructure/model_fp16.onnx')
-        #_ = sess.run(None, {'x': np.zeros((1, 3, 488, 488), dtype='float32')})
+        sess = ort.InferenceSession(os.environ['MODEL_PATH'] + 'table_sim.onnx', providers=['CPUExecutionProvider']) #, sess_options=sess_options, providers=[("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})]
+        _ = sess.run(None, {'x': np.zeros((1, 3, 488, 488), dtype='float32')})
         self.predictor, self.input_tensor, self.output_tensors, self.config = sess, sess.get_inputs()[0], None, None
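
The change repeated across layout.py and ocr.py is one pattern: probe for a GPU with GPUtil, build the ONNX Runtime provider list from the result, and route each language key to its own detector/recognizer weights. The following standalone sketch illustrates that pattern under the patch's assumptions (GPUtil and onnxruntime installed, MODEL_PATH pointing at a directory containing the ONNX files named in the diff); the pick_providers and load_session helpers and the __main__ wiring are illustrative only, not code from this repository.

# Sketch of the provider-selection and per-language routing pattern in this patch.
# Assumes MODEL_PATH is set and holds the ONNX weights named in the diff.
import os

import GPUtil
import onnxruntime


def pick_providers():
    # Prefer CUDA with the heuristic cuDNN conv search when a GPU is visible,
    # otherwise fall back to CPU only.
    if GPUtil.getGPUs():
        return [
            ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"}),
            "CPUExecutionProvider",
        ]
    return ["CPUExecutionProvider"]


def load_session(model_name, providers):
    # Illustrative helper: one InferenceSession per model file under MODEL_PATH.
    return onnxruntime.InferenceSession(
        os.path.join(os.environ["MODEL_PATH"], model_name), providers=providers
    )


if __name__ == "__main__":
    providers = pick_providers()
    on_gpu = len(providers) > 1
    rec_batch_num = 6 if on_gpu else 1          # mirrors rec_batch_num in ocr.py
    layout = load_session("layout.onnx" if on_gpu else "layout_s.onnx", providers)
    # Per-language routing as in TextSystem: 'multi' reuses the Chinese detector
    # weights but switches to the large multilingual recognizer.
    det = {lang: load_session(name, providers)
           for lang, name in {"ch": "det_cn.onnx", "en": "det_en.onnx", "multi": "det_cn.onnx"}.items()}
    rec = {lang: load_session(name, providers)
           for lang, name in {"ch": "rec_ch.onnx", "en": "rec_en.onnx", "multi": "rec_multi_large.onnx"}.items()}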