From f8fdf605702ceafa20aa39e61548e48c76d24b68 Mon Sep 17 00:00:00 2001 From: yuyutaotao <167746126+yuyutaotao@users.noreply.github.com> Date: Wed, 28 Aug 2024 19:21:32 +0800 Subject: [PATCH] fix(web-extract): fix the extractor may fail if there is no (#76) --- packages/web-integration/src/common/tasks.ts | 14 +- .../web-integration/src/extractor/debug.ts | 1 + .../web-integration/src/extractor/dom-util.ts | 9 + .../src/extractor/extractor.ts | 40 ++- .../web-integration/src/extractor/util.ts | 23 +- .../__snapshots__/extractor.test.ts.snap | 37 +++ .../tests/unit-test/extractor.test.ts | 24 ++ .../unit-test/fixtures/extractor/index.html | 312 ++++++++++-------- .../unit-test/fixtures/extractor/input.png | Bin 97854 -> 117421 bytes .../unit-test/fixtures/extractor/output.png | Bin 114050 -> 134556 bytes .../fixtures/extractor/scroll/input.png | Bin 0 -> 28137 bytes .../fixtures/extractor/scroll/output.png | Bin 0 -> 30721 bytes 12 files changed, 296 insertions(+), 164 deletions(-) create mode 100644 packages/web-integration/tests/unit-test/fixtures/extractor/scroll/input.png create mode 100644 packages/web-integration/tests/unit-test/fixtures/extractor/scroll/output.png diff --git a/packages/web-integration/src/common/tasks.ts b/packages/web-integration/src/common/tasks.ts index f229bed3..58a1be55 100644 --- a/packages/web-integration/src/common/tasks.ts +++ b/packages/web-integration/src/common/tasks.ts @@ -98,7 +98,8 @@ export class PageTaskExecutor { type: 'Insight', subType: 'Locate', param: plan.param, - executor: async (param) => { + executor: async (param, taskContext) => { + const { task } = taskContext; let insightDump: InsightDump | undefined; const dumpCollector: DumpSubscriber = (dump) => { insightDump = dump; @@ -124,7 +125,6 @@ export class PageTaskExecutor { }, }); - assert(element, `Element not found: ${param.prompt}`); if (locateResult) { this.taskCache.saveCache({ type: 'locate', @@ -136,6 +136,13 @@ export class PageTaskExecutor { response: locateResult, }); } + if (!element) { + task.log = { + dump: insightDump, + }; + throw new Error(`Element not found: ${param.prompt}`); + } + return { output: { element, @@ -296,8 +303,7 @@ export class PageTaskExecutor { subType: 'Sleep', param: plan.param, executor: async (taskParam) => { - assert(taskParam.timeMs, 'No time to sleep'); - await sleep(taskParam.timeMs); + await sleep(taskParam.timeMs || 3000); }, }; return taskActionSleep; diff --git a/packages/web-integration/src/extractor/debug.ts b/packages/web-integration/src/extractor/debug.ts index 24fcb047..934b7900 100644 --- a/packages/web-integration/src/extractor/debug.ts +++ b/packages/web-integration/src/extractor/debug.ts @@ -2,3 +2,4 @@ import { extractTextWithPosition } from '.'; console.log(extractTextWithPosition(document.body, true)); console.log(JSON.stringify(extractTextWithPosition(document.body, false))); +(window as any).extractTextWithPosition = extractTextWithPosition; diff --git a/packages/web-integration/src/extractor/dom-util.ts b/packages/web-integration/src/extractor/dom-util.ts index d8d0e252..562d535e 100644 --- a/packages/web-integration/src/extractor/dom-util.ts +++ b/packages/web-integration/src/extractor/dom-util.ts @@ -20,3 +20,12 @@ export function isImgElement(node: Node): node is HTMLImageElement { export function isTextElement(node: Node): node is HTMLTextAreaElement { return node.nodeName.toLowerCase() === '#text'; } + +export function isWidgetElement(node: Node): node is HTMLElement { + return ( + node instanceof HTMLElement && + (node.hasAttribute('aria-label') || + node.hasAttribute('aria-controls') || + node.hasAttribute('aria-labelledby')) + ); +} diff --git a/packages/web-integration/src/extractor/extractor.ts b/packages/web-integration/src/extractor/extractor.ts index cd5c9296..23206412 100644 --- a/packages/web-integration/src/extractor/extractor.ts +++ b/packages/web-integration/src/extractor/extractor.ts @@ -1,15 +1,16 @@ -import { NodeType, TEXT_SIZE_THRESHOLD } from '@midscene/shared/constants'; +import { NodeType } from '@midscene/shared/constants'; import { isButtonElement, isFormElement, isImgElement, isTextElement, + isWidgetElement, } from './dom-util'; import { - generateHash, getNodeAttributes, getPseudoElementContent, logger, + midsceneGenerateHash, setDataForNode, setDebugMode, visibleRect, @@ -36,7 +37,7 @@ export interface ElementInfo { center: [number, number]; } -const container: HTMLElement = document.body; +const container: HTMLElement = document.body || document; function generateId(numberId: number) { // const letters = 'ABCDEFGHIJKLMNPRSTUVXYZ'; @@ -89,7 +90,7 @@ export function extractTextWithPosition( if (isFormElement(node)) { const attributes = getNodeAttributes(node); - const nodeHashId = generateHash(attributes.placeholder, rect); + const nodeHashId = midsceneGenerateHash(attributes.placeholder, rect); const selector = setDataForNode(node, nodeHashId); let valueContent = attributes.value || attributes.placeholder || node.textContent || ''; @@ -130,7 +131,7 @@ export function extractTextWithPosition( const attributes = getNodeAttributes(node); const pseudo = getPseudoElementContent(node); const content = node.innerText || pseudo.before || pseudo.after || ''; - const nodeHashId = generateHash(content, rect); + const nodeHashId = midsceneGenerateHash(content, rect); const selector = setDataForNode(node, nodeHashId); elementInfoArray.push({ id: nodeHashId, @@ -155,7 +156,7 @@ export function extractTextWithPosition( if (isImgElement(node)) { const attributes = getNodeAttributes(node); - const nodeHashId = generateHash('', rect); + const nodeHashId = midsceneGenerateHash('', rect); const selector = setDataForNode(node, nodeHashId); elementInfoArray.push({ id: nodeHashId, @@ -188,7 +189,7 @@ export function extractTextWithPosition( if (!text.trim() && attributeKeys.length === 0) { return; } - const nodeHashId = generateHash(text, rect); + const nodeHashId = midsceneGenerateHash(text, rect); const selector = setDataForNode(node, nodeHashId); elementInfoArray.push({ id: nodeHashId, @@ -212,6 +213,31 @@ export function extractTextWithPosition( return; } + if (isWidgetElement(node)) { + const attributes = getNodeAttributes(node); + const nodeHashId = midsceneGenerateHash('', rect); + const selector = setDataForNode(node, nodeHashId); + elementInfoArray.push({ + id: nodeHashId, + indexId: generateId(nodeIndex++), + nodeHashId, + nodeType: NodeType.FORM_ITEM, + locator: selector, + attributes: { + ...attributes, + nodeType: NodeType.FORM_ITEM, + }, + content: '', + rect, + center: [ + Math.round(rect.left + rect.width / 2), + Math.round(rect.top + rect.height / 2), + ], + htmlNode: debugMode ? node : null, + }); + return true; + } + return true; } diff --git a/packages/web-integration/src/extractor/util.ts b/packages/web-integration/src/extractor/util.ts index 5806921d..b3f00f52 100644 --- a/packages/web-integration/src/extractor/util.ts +++ b/packages/web-integration/src/extractor/util.ts @@ -154,13 +154,17 @@ export function visibleRect( if (parentStyle.overflow === 'hidden') { const parentRect = parent.getBoundingClientRect(); const tolerance = 10; + if ( - rect.top < parentRect.top - tolerance && - rect.left < parentRect.left - tolerance && - rect.bottom > parentRect.bottom + tolerance && - rect.right > parentRect.right + tolerance + rect.right < parentRect.left - tolerance || + rect.left > parentRect.right + tolerance || + rect.bottom < parentRect.top - tolerance || + rect.top > parentRect.bottom + tolerance ) { - logger('Element is clipped by an ancestor', parent, rect, parentRect); + logger(el, 'element is partially or totally hidden by an ancestor', { + rect, + parentRect, + }); return false; } } @@ -168,8 +172,8 @@ export function visibleRect( } return { - left: Math.round(rect.left - scrollLeft), - top: Math.round(rect.top - scrollTop), + left: rect.left, + top: rect.top, width: Math.round(rect.width), height: Math.round(rect.height), }; @@ -232,7 +236,7 @@ export function getNodeAttributes( return Object.fromEntries(attributesList); } -export function generateHash(content: string, rect: any): string { +export function midsceneGenerateHash(content: string, rect: any): string { // Combine the input into a string const combined = JSON.stringify({ content, rect }); // Generates the ha-256 hash value @@ -242,4 +246,5 @@ export function generateHash(content: string, rect: any): string { return hashHex.slice(0, 10); } -(window as any).generateHash = generateHash; +(window as any).midsceneGenerateHash = midsceneGenerateHash; +(window as any).midsceneVisibleRect = visibleRect; diff --git a/packages/web-integration/tests/unit-test/__snapshots__/extractor.test.ts.snap b/packages/web-integration/tests/unit-test/__snapshots__/extractor.test.ts.snap index 5d77ac14..3680fec6 100644 --- a/packages/web-integration/tests/unit-test/__snapshots__/extractor.test.ts.snap +++ b/packages/web-integration/tests/unit-test/__snapshots__/extractor.test.ts.snap @@ -372,5 +372,42 @@ exports[`extractor > basic 1`] = ` }, "content": "", }, + { + "attributes": { + "nodeType": "TEXT Node", + }, + "content": "content AAA", + }, + { + "attributes": { + "aria-label": "Click me", + "class": ".widget", + "nodeType": "FORM_ITEM Node", + "role": "button", + }, + "content": "", + }, + { + "attributes": { + "nodeType": "TEXT Node", + }, + "content": "Click me", + }, + { + "attributes": { + "aria-controls": "semi-select-5yxiyng", + "class": ".widget", + "nodeType": "FORM_ITEM Node", + }, + "content": "", + }, + { + "attributes": { + "aria-labelledby": "eval_object.object_type-label", + "class": ".widget", + "nodeType": "FORM_ITEM Node", + }, + "content": "", + }, ] `; diff --git a/packages/web-integration/tests/unit-test/extractor.test.ts b/packages/web-integration/tests/unit-test/extractor.test.ts index 1f1219e1..4fc78518 100644 --- a/packages/web-integration/tests/unit-test/extractor.test.ts +++ b/packages/web-integration/tests/unit-test/extractor.test.ts @@ -35,6 +35,30 @@ describe( await reset(); }); + it('scroll', async () => { + const { page, reset } = await launchPage(`file://${pagePath}`, { + viewport: { + width: 1080, + height: 200, + }, + }); + await page.evaluate(() => { + window.scrollTo(0, 400); + }); + await new Promise((resolve) => setTimeout(resolve, 1000)); + await generateExtractData( + page, + path.join(__dirname, 'fixtures/extractor/scroll'), + { + disableInputImage: false, + disableOutputImage: false, + disableOutputWithoutTextImg: true, + disableResizeOutputImg: true, + disableSnapshot: true, + }, + ); + }); + it('profile ', async () => { const { page, reset } = await launchPage('https://webinfra.org/about'); await new Promise((resolve) => setTimeout(resolve, 1000)); diff --git a/packages/web-integration/tests/unit-test/fixtures/extractor/index.html b/packages/web-integration/tests/unit-test/fixtures/extractor/index.html index 7ce52f9d..68a77c0b 100644 --- a/packages/web-integration/tests/unit-test/fixtures/extractor/index.html +++ b/packages/web-integration/tests/unit-test/fixtures/extractor/index.html @@ -2,170 +2,194 @@ - - - Sample HTML Page - + + + Sample HTML Page + -

Data Record

-

1970-01-01 19:25:01

-

User Name: Stella

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
IDField 2Field 3Field 4Field 5
30SKace CervantesAylin SawyerJefferson KirbySkyla Jefferson
70UFlorence DavenportDariel AcevedoAshlynn DelacruzMemphis Leal
3AYCrystal NewmanAnderson BrownCharlotte GriffithFranklin Everett
YPGKori PayneEdward BlevinsAila GillMatthias Reed
ZENMagnolia DukeKalel GloverAlessia BartonCassius Peck
-
-

Form

- - - - - -

-    
-
-
-    small_img
-  
- - -
+

Data Record

+

1970-01-01 19:25:01

+

User Name: Stella

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IDField 2Field 3Field 4Field 5
30SKace CervantesAylin SawyerJefferson KirbySkyla Jefferson
70UFlorence DavenportDariel AcevedoAshlynn DelacruzMemphis Leal
3AYCrystal NewmanAnderson BrownCharlotte GriffithFranklin Everett
YPGKori PayneEdward BlevinsAila GillMatthias Reed
ZENMagnolia DukeKalel GloverAlessia BartonCassius Peck
-
-
    -
  • English
  • -
  • 中文
  • -
  • Tiếng Việt -
  • -
-
-
-
+

Form

+ + + + + +

+        
+        small_img
+        
 
-  
-  
- - -
+ - -
- - - -
- - -