diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts index 581640c19..e7f647790 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts @@ -76,4 +76,11 @@ export class MappingSourceTextProvider { isBlank(text: string): boolean { return this.normalizer.isBlank(text); } + + /** + * Cleanup the consumed text position history + */ + resetHistory() { + this.provider.resetHistory(); + } } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/TextProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/TextProvider.ts index 7a062350c..c91d35603 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/TextProvider.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/TextProvider.ts @@ -133,4 +133,11 @@ export class TextProvider { }); this.history = validSpans.slice(0, MAX_HISTORY); } + + /** + * Cleanup the consumed text position history + */ + resetHistory() { + this.history = []; + } } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/__tests__/getTextBoxMapping.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/__tests__/getTextBoxMapping.test.ts index 52343acde..5af957d64 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/__tests__/getTextBoxMapping.test.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/__tests__/getTextBoxMapping.test.ts @@ -61,4 +61,35 @@ describe('getTextBoxMapping', () => { expect(normalized2?.cell).toBe(target.cellAt(1)); // mapped to 2nd cell in target expect(normalized2?.span).toEqual([0, 3]); // mapped to the span [0, 3] in the 2nd cell }); + + it('should map text after long match properly', () => { + const source = new SimpleTextLayout([ + // text_mappings box + { + text: '1 abc def ghi jkl 2 abc def ghi jkl 3 abc def ghi jkl 4 abc def ghi jkl', + bbox: bbox(0, 0, 2, 4) + } + ]); + const target = new SimpleTextLayout([ + // 1st line + { bbox: bbox(0, 0), text: '1 abc def ghi' }, + { bbox: bbox(1, 0), text: 'jkl' }, + // 2nd line + { bbox: bbox(0, 1), text: '2 abc def ghi' }, + { bbox: bbox(1, 1), text: 'jkl' }, + // 3nd line + { bbox: bbox(0, 2), text: '3 abc def ghi' }, + { bbox: bbox(1, 2), text: 'jkl' }, + // 4th line + { bbox: bbox(0, 3), text: '4 abc def ghi' }, + { bbox: bbox(1, 3), text: 'jkl' } + ]); + const mapping = getTextBoxMappings(source, target); + + // verify the mapping result of the 1st 'jkl' in the source + const result1 = mapping.apply(source.cellAt(0).getPartial([14, 17])); + expect(result1).toHaveLength(1); + const normalized1 = result1[0].cell?.getNormalized(); + expect(normalized1?.cell).toBe(target.cellAt(1)); // mapped to first cell in target + }); }); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/getTextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/getTextBoxMapping.ts index 97703e82f..787334583 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/getTextBoxMapping.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textBoxMapping/getTextBoxMapping.ts @@ -51,6 +51,7 @@ export function getTextBoxMappings< matchInSource.markAsMapped(); } }); + source.resetHistory(); } return builder.toTextBoxMapping(); } @@ -178,6 +179,13 @@ class Source provider.resetHistory()); + } + /** * Find the best source (larger text layout cell) where text `textToMatch` is in * @param sources source (larger) text layout cells overlapping the current target cell diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textLayout/PdfTextContentTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textLayout/PdfTextContentTextLayout.ts index a57dd011c..fd51fbb0a 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textLayout/PdfTextContentTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfHighlight/utils/textLayout/PdfTextContentTextLayout.ts @@ -1,5 +1,6 @@ import { bboxesIntersect } from 'components/DocumentPreview/utils/box'; import { PDFPageViewport, PDFPageViewportOptions, TextContentItem } from 'pdfjs-dist'; +import { sortBy } from 'lodash'; import { Bbox, TextSpan } from '../../types'; import { BaseTextLayoutCell } from './BaseTextLayout'; import { getAdjustedCellByOffsetByDom } from './dom'; @@ -18,16 +19,29 @@ export class PdfTextContentTextLayout implements TextLayout { + const htmlBboxIndicesByIndex = textContentItems.map(item => { const cellBbox = getBbox(item, this.viewport); - let isInHtmlBbox = false; if (htmlBboxInfo?.bboxes?.length) { - isInHtmlBbox = htmlBboxInfo.bboxes.some(bbox => { + return htmlBboxInfo.bboxes.findIndex(bbox => { return bboxesIntersect(cellBbox, [bbox.left, bbox.top, bbox.right, bbox.bottom]); }); } + return -1; + }); + const cells = textContentItems.map((item, index) => { + const cellBbox = getBbox(item, this.viewport); + const isInHtmlBbox = htmlBboxIndicesByIndex[index] >= 0; return new PdfTextContentTextLayoutCell(this, index, item, pageNum, cellBbox, isInHtmlBbox); }); + + const htmlBboxIndicesPatched = htmlBboxIndicesByIndex.reduce( + (acc, value, index) => { + acc[index] = value >= 0 ? value : index > 0 ? acc[index - 1] : -1; + return acc; + }, + [...htmlBboxIndicesByIndex] + ); + this.cells = sortBy(cells, cell => htmlBboxIndicesPatched[cell.id]); } /** diff --git a/yarn.lock b/yarn.lock index 24b01ff2f..67b32f48b 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2163,7 +2163,7 @@ __metadata: languageName: node linkType: hard -"@ibm-watson/discovery-react-components@^1.5.0-beta.29, @ibm-watson/discovery-react-components@workspace:packages/discovery-react-components": +"@ibm-watson/discovery-react-components@^1.5.0-beta.30, @ibm-watson/discovery-react-components@workspace:packages/discovery-react-components": version: 0.0.0-use.local resolution: "@ibm-watson/discovery-react-components@workspace:packages/discovery-react-components" dependencies: @@ -10664,7 +10664,7 @@ __metadata: dependencies: "@carbon/icons": ^10.5.0 "@cypress/webpack-preprocessor": ^5.11.0 - "@ibm-watson/discovery-react-components": ^1.5.0-beta.29 + "@ibm-watson/discovery-react-components": ^1.5.0-beta.30 "@ibm-watson/discovery-styles": ^1.5.0-beta.24 "@testing-library/cypress": ^7.0.7 "@types/proper-url-join": ^2