stardog-union · joshhk72 · Sep 21, 2020 · Sep 21, 2020 · Sep 30, 2020 · Sep 30, 2020
@@ -36,6 +36,7 @@ import {
   getCommonCompletionItemsGivenNamespaces,
   makeCompletionItemFromPrefixedNameAndNamespaceIri,
   ARBITRARILY_LARGE_NUMBER,
+  unescapeString,
 } from 'stardog-language-utils';
 import uniqBy from 'lodash.uniqby';
 import { IToken } from 'chevrotain';
@@ -93,6 +94,35 @@ export class SparqlLanguageServer extends AbstractLanguageServer<
     };
   }
 
+  /**
+   * Parses a document whose text may contain `\uXXXX` / `\UXXXXXXXX` escape
+   * sequences.
+   *
+   * The text is unescaped with `unescapeString` before it is handed to the
+   * parser. When at least one replacement was recorded in `indexMap`, the CST,
+   * the tokens and the errors are passed through the `adjust*ForEscapedText`
+   * helpers together with that map; otherwise they are returned exactly as
+   * the parser produced them.
+   */
+  parseDocument(
+    document: TextDocument
+  ): ReturnType<
+    AbstractLanguageServer<
+      StardogSparqlParser | W3SpecSparqlParser
+    >['parseDocument']
+  > {
+    const { indexMap, unescapedString } = unescapeString(document.getText());
+    const parseResult = this.parser.parse(unescapedString);
+    const { cst: rawCst, errors: rawErrors, ...otherParseData } = parseResult;
+    // `parser.input` is read only after `parse` has been called.
+    const rawTokens = this.parser.input;
+    // An empty map means nothing was replaced, so no adjustment is needed.
+    const hasEscapes = Boolean(indexMap.size);
+
+    return {
+      cst: hasEscapes
+        ? this.adjustCstForEscapedText(rawCst, indexMap)
+        : rawCst,
+      tokens: hasEscapes
+        ? rawTokens.map((token) =>
+            this.adjustTokenForEscapedText(token, indexMap)
+          )
+        : rawTokens,
+      errors: hasEscapes
+        ? this.adjustErrorsForEscapedText(rawErrors, indexMap)
+        : rawErrors,
+      otherParseData: otherParseData as Omit<
+        ReturnType<StardogSparqlParser['parse']>,
+        'cst' | 'errors'
+      >,
+    };
+  }
+
   handleUpdateCompletionData(update: SparqlCompletionData) {
     // `relationshipCompletionItems` and `typeCompletionItems` must be updated
     // in 2 different scenarios:

@@ -16,7 +16,7 @@ import {
   TextDocumentChangeEvent,
   TextDocumentPositionParams,
 } from 'vscode-languageserver';
-import { Parser, IToken } from 'chevrotain';
+import { Parser, IToken, IRecognitionException } from 'chevrotain';
 import { IStardogParser, isCstNode, traverse, ISemanticError } from 'millan';
 import { ParseStateManager, getParseStateManager } from './parseState';
 
@@ -118,6 +118,104 @@ export abstract class AbstractLanguageServer<
     return this.onInitialization(params);
   }
 
+  /**
+   * Recursively copies a CST value, replacing every token found in it with a
+   * position-adjusted copy.
+   *
+   * - Arrays are mapped element by element.
+   * - Objects that own an `endColumn` property are treated as tokens and
+   *   handed to `adjustTokenForEscapedText`.
+   * - Any other non-null object is shallow-copied and each of its own
+   *   enumerable properties is processed recursively.
+   * - Everything else (primitives, `null`) is returned unchanged.
+   */
+  adjustCstForEscapedText(
+    input: Object | Object[],
+    indexMap: Map<number, number>
+  ) {
+    if (Array.isArray(input)) {
+      return input.map((element) =>
+        this.adjustCstForEscapedText(element, indexMap)
+      );
+    }
+
+    const isPlainValue = typeof input !== 'object' || input === null;
+    if (isPlainValue) {
+      return input;
+    }
+
+    // Tokens are recognised by the presence of their own `endColumn` field.
+    if (input.hasOwnProperty('endColumn')) {
+      return this.adjustTokenForEscapedText(input as IToken, indexMap);
+    }
+
+    const adjustedNode = { ...input };
+    for (const key of Object.keys(adjustedNode)) {
+      adjustedNode[key] = this.adjustCstForEscapedText(
+        adjustedNode[key],
+        indexMap
+      );
+    }
+    return adjustedNode;
+  }
+
+  /**
+   * Returns a new array holding a position-adjusted copy of each error in
+   * `errors`, produced by `adjustErrorForEscapedText` with the same
+   * `indexMap`.
+   */
+  adjustErrorsForEscapedText(
+    errors: IRecognitionException[],
+    indexMap: Map<number, number>
+  ) {
+    const adjustOne = (recognitionError: IRecognitionException) =>
+      this.adjustErrorForEscapedText(recognitionError, indexMap);
+    return errors.map((recognitionError) => adjustOne(recognitionError));
+  }
+
+  /**
+   * Returns a shallow copy of `error` in which every token it carries
+   * (`token`, `previousToken` and each entry of `resyncedTokens`) has been
+   * replaced by a position-adjusted copy. The input error is not mutated.
+   */
+  adjustErrorForEscapedText(
+    error: IRecognitionException,
+    indexMap: Map<number, number>
+  ): IRecognitionException {
+    // chevrotain's `IRecognitionException` type does not declare
+    // `previousToken`, although errors coming from the parser may carry one.
+    // Widening the local type keeps the accesses below type-checked without
+    // resorting to `@ts-ignore`.
+    type ErrorWithPreviousToken = IRecognitionException & {
+      previousToken?: IToken;
+    };
+
+    const newError: ErrorWithPreviousToken = {
+      ...error,
+      // Object spread copies only own enumerable properties; copy these two
+      // explicitly so they survive even when they are not enumerable on
+      // `error`.
+      name: error.name,
+      message: error.message,
+    };
+
+    if (newError.token) {
+      newError.token = this.adjustTokenForEscapedText(newError.token, indexMap);
+    }
+    if (newError.previousToken) {
+      newError.previousToken = this.adjustTokenForEscapedText(
+        newError.previousToken,
+        indexMap
+      );
+    }
+    if (newError.resyncedTokens) {
+      newError.resyncedTokens = newError.resyncedTokens.map((token) =>
+        this.adjustTokenForEscapedText(token, indexMap)
+      );
+    }
+    return newError;
+  }
+
+  /**
+   * Returns a copy of `token` whose `startOffset` / `endOffset` have been run
+   * through `adjustPositionsForEscapedText`. `startColumn` / `endColumn` are
+   * then set to the adjusted offsets plus one. The input token is not mutated.
+   *
+   * NOTE(review): offset + 1 equals the column only for text on the first
+   * line of a document — confirm that callers do not rely on the column
+   * fields for multi-line input.
+   */
+  adjustTokenForEscapedText(token: IToken, indexMap: Map<number, number>) {
+    const [startOffset, endOffset] = this.adjustPositionsForEscapedText(
+      token.startOffset,
+      token.endOffset,
+      indexMap
+    );
+    const adjustedToken: IToken = {
+      ...token,
+      startOffset,
+      endOffset,
+      startColumn: startOffset + 1,
+      endColumn: endOffset + 1,
+    };
+    return adjustedToken;
+  }
+
+  /**
+   * Shifts a start/end position pair by the displacements recorded in
+   * `indexMap` (key: an index, value: the amount to add).
+   *
+   * Entries are visited in the map's iteration order:
+   * - an entry whose key lies before `startPosition` shifts both positions;
+   * - an entry whose key lies within `[startPosition, endPosition]` shifts
+   *   only the end position;
+   * - the first entry whose key lies past `endPosition` stops the scan, and
+   *   the entries after it are not considered.
+   *
+   * @returns `[adjustedStart, adjustedEnd]`
+   */
+  adjustPositionsForEscapedText(
+    startPosition: number,
+    endPosition: number,
+    indexMap: Map<number, number>
+  ): [number, number] {
+    let startShift = 0;
+    let endShift = 0;
+
+    for (const [escapeIndex, displacement] of indexMap) {
+      // Both tests are written as negations of `<=` / `>=` so that positions
+      // which are not ordinary numbers (e.g. NaN) take the same branches.
+      const escapeIsBeforeRange = !(startPosition <= escapeIndex);
+      if (escapeIsBeforeRange) {
+        startShift += displacement;
+        endShift += displacement;
+        continue;
+      }
+
+      const escapeIsAfterRange = !(endPosition >= escapeIndex);
+      if (escapeIsAfterRange) {
+        break;
+      }
+
+      endShift += displacement;
+    }
+
+    return [startPosition + startShift, endPosition + endShift];
+  }
+
   abstract onContentChange(
     params: TextDocumentChangeEvent,
     parseResults: ReturnType<AbstractLanguageServer<T>['parseDocument']>

@@ -4,6 +4,7 @@ import {
   namespaceObjToArray,
   namespaceArrayToObj,
   abbreviatePrefixObj,
+  unescapeString,
 } from '../namespaceUtils';
 
 describe('splitNamespace', () => {
@@ -91,3 +92,12 @@ describe('abbreviatePrefixArray and abbreviatePrefixObj', () => {
     );
   });
 });
+
+describe('unescapeString', () => {
+  it('unescapes a string with escaped 4 digit unicode sequences', () => {
+    const escaped = 'S\\u0045LECT * \\u007B ?s ?p ?o \\u007D';
+    const { indexMap, unescapedString } = unescapeString(escaped);
+
+    // `toBe` rather than `toMatch`: given a string, `toMatch` only checks for
+    // a substring, so stray leftover characters would go unnoticed.
+    expect(unescapedString).toBe('SELECT * { ?s ?p ?o }');
+
+    // Each entry maps the index of a replaced character in the unescaped text
+    // to the number of characters the replacement removed.
+    expect(Array.from(indexMap.entries())).toEqual([
+      [1, 5],
+      [9, 5],
+      [20, 5],
+    ]);
+  });
+});
@@ -72,3 +72,39 @@ export const namespaceArrayToObj = (array) =>
       [alias]: prefix,
     };
   }, {});
+
+// Matches `\uXXXX` (4 hex digits) and `\UXXXXXXXX` (8 hex digits).
+const escapeSequence = /\\u([a-fA-F0-9]{4})|\\U([a-fA-F0-9]{8})/g;
+
+// Largest valid Unicode code point; `String.fromCodePoint` throws above it.
+const MAX_CODE_POINT = 0x10ffff;
+
+/**
+ * Replaces every `\uXXXX` and `\UXXXXXXXX` escape sequence in `item` with the
+ * character it denotes.
+ *
+ * @param item The text to unescape.
+ * @returns `unescapedString`, the text after replacement, and `indexMap`,
+ *   which has one entry per replaced sequence. The key is the index of the
+ *   replacement within `unescapedString`; the value is how many UTF-16 code
+ *   units shorter the replacement is than the sequence it replaced (5 for
+ *   `\uXXXX`, 9 for `\UXXXXXXXX` in the BMP, 8 for `\UXXXXXXXX` above U+FFFF,
+ *   which needs a surrogate pair). Entries are inserted in ascending key
+ *   order.
+ *
+ * Sequences whose value exceeds U+10FFFF are not valid code points; they are
+ * left in the text untouched and get no `indexMap` entry. A non-string `item`
+ * is returned as-is with an empty map.
+ */
+export const unescapeString = (
+  item: string
+): { indexMap: Map<number, number>; unescapedString: string } => {
+  const indexMap = new Map<number, number>();
+  if (typeof item !== 'string') {
+    // Best-effort: nothing to unescape, hand the value back unchanged.
+    return { indexMap, unescapedString: item };
+  }
+
+  // Total number of code units removed by the replacements made so far.
+  let displaceTotal = 0;
+  const unescapedString = item.replace(
+    escapeSequence,
+    (
+      match: string,
+      _unicode4: string | undefined,
+      _unicode8: string | undefined,
+      offset: number
+    ) => {
+      // Everything after the two-character `\u` / `\U` prefix is hex digits.
+      const codePoint = parseInt(match.slice(2), 16);
+      if (codePoint > MAX_CODE_POINT) {
+        return match;
+      }
+
+      // `fromCodePoint` emits a surrogate pair for code points above U+FFFF,
+      // where `fromCharCode` would silently truncate to 16 bits.
+      const replacement = String.fromCodePoint(codePoint);
+      const displaceNum = match.length - replacement.length;
+
+      // `offset` is relative to the escaped text; subtracting what has been
+      // removed so far gives the index in the unescaped text.
+      indexMap.set(offset - displaceTotal, displaceNum);
+      displaceTotal += displaceNum;
+      return replacement;
+    }
+  );
+  return { indexMap, unescapedString };
+};