initial port of the tokenizer

whesse · Aug 17, 2012 · 8664535 · 8664535
1 parent 4467ef7
commit 8664535
Show file tree

Hide file tree

Showing 27 changed files with 60,735 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+packages/
+pubspec.lock
+.project
+.children
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,23 @@
+Copyright (c) 2006-2012 The Authors
+
+Contributors:
+James Graham - [email protected]
+Anne van Kesteren - [email protected]
+Lachlan Hunt - [email protected]
+Matt McDonald - [email protected]
+Sam Ruby - [email protected]
+Ian Hickson (Google) - [email protected]
+Thomas Broyer - [email protected]
+Jacques Distler - [email protected]
+Henri Sivonen - [email protected]
+Adam Barth - [email protected]
+Eric Seidel - [email protected]
+The Mozilla Foundation (contributions from Henri Sivonen since 2008)
+David Flanagan (Mozilla) - [email protected]
+Google Inc. (contributed the Dart port) - [email protected]
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
@@ -1,4 +1,39 @@
-html5lib
-========
+html5lib in Pure Dart
+=====================
 
-html5lib in pure dart
+This is a pure [Dart][dart] [html5 parser][html5lib]. It's a port of
+[html5lib](http://code.google.com/p/html5lib/) from Python. Since it's 100%
+Dart you can use it safely from a script or server side app.
+
+Eventually the parse tree API will be compatible with [dart:html][d_html], so
+the same code will work on the client or the server.
+
+This library is not finished. These files from the [html5lib directory][files]
+still need to be ported:
+
+* `html5parser.py`
+* `ihatexml.py`
+* `sanitizer.py`
+* `filters/*`
+* `serializer/*`
+* `treebuilders/*`
+* `treewalkers/*`
+* most of `tests`
+
+
+Running Tests
+-------------
+
+Dependencies are installed using the [Pub Package Manager][pub].
+
+    pub install
+
+    # Run command line tests
+    #export DART_SDK=path/to/dart/sdk
+    tests/run.sh
+
+
+[dart]: http://www.dartlang.org/
+[html5lib]: http://dev.w3.org/html5/spec/parsing.html
+[d_html]: http://api.dartlang.org/docs/continuous/dart_html.html
+[files]: http://html5lib.googlecode.com/hg/python/html5lib/
+[pub]: http://www.dartlang.org/docs/pub-package-manager/
diff --git a/codecs.dart b/codecs.dart
@@ -0,0 +1,214 @@
+/** Decodes bytes using the correct name. See [decodeBytes]. */
+#library('codecs');
+
+#import('dart:utf');
+#import('dart:io'); // for DecoderException
+
+/**
+ * Returns true when the three bytes starting at [offset] are the UTF-8 byte
+ * order mark (0xEF 0xBB 0xBF).
+ *
+ * Only the range of [length] bytes beginning at [offset] is considered; when
+ * [length] is omitted the range extends to the end of [bytes]. A range shorter
+ * than three bytes never contains a BOM.
+ */
+bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
+  int end = bytes.length;
+  if (length != null) end = offset + length;
+
+  // Too short to hold the marker; bail out before indexing.
+  if (end - offset < 3) return false;
+
+  return bytes[offset] == 0xEF &&
+      bytes[offset + 1] == 0xBB &&
+      bytes[offset + 2] == 0xBF;
+}
+
+// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
+// file, but dart:utf does not expose stream-based decoders yet.
+/**
+ * Decodes [bytes] using the named [encoding] and returns an iterable of the
+ * resulting codepoints. Supports the major unicode encodings as well as ascii
+ * and windows-1252.
+ *
+ * Decoding starts at [offset] and covers [length] bytes; when [length] is
+ * omitted it defaults to everything from [offset] to the end of [bytes]. A
+ * leading UTF-8 byte order mark is skipped for the 'utf-8' encoding.
+ * [replacementCodepoint] is forwarded to the non-ascii decoders.
+ *
+ * Throws a [DecoderException] if 'ascii' input contains a byte above 127, and
+ * an [IllegalArgumentException] if [encoding] is not one of the names handled
+ * below.
+ */
+Iterable<int> decodeBytes(String encoding, List<int> bytes,
+    [int offset = 0, int length,
+    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
+  // [length] is a byte count measured from [offset] (the utf-8 branch below
+  // shrinks both together), so the default must be what remains after
+  // [offset] rather than the size of the whole list.
+  if (length == null) length = bytes.length - offset;
+  final replace = replacementCodepoint;
+  switch (encoding) {
+    case 'ascii':
+      bytes = bytes.getRange(offset, length);
+      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
+      for (int byte in bytes) {
+        if (byte > 127) {
+          throw new DecoderException("Illegal ASCII character $byte");
+        }
+      }
+      return bytes;
+
+    case 'windows-1252':
+    case 'cp1252':
+      // Decode an exact, zero-based slice so the result does not depend on how
+      // the decoder combines a non-zero offset with a length.
+      return decodeWindows1252AsIterable(
+          bytes.getRange(offset, length), 0, length, replace);
+
+    case 'utf-8':
+      // NOTE: to match the behavior of the other decode functions, we eat the
+      // utf-8 BOM here.
+      if (hasUtf8Bom(bytes, offset, length)) {
+        offset += 3;
+        length -= 3;
+      }
+      return decodeUtf8AsIterable(bytes, offset, length, replace);
+
+    case 'utf-16':
+      return decodeUtf16AsIterable(bytes, offset, length, replace);
+    case 'utf-16-be':
+      return decodeUtf16beAsIterable(bytes, offset, length, true, replace);
+    case 'utf-16-le':
+      return decodeUtf16leAsIterable(bytes, offset, length, true, replace);
+
+    case 'utf-32':
+      return decodeUtf32AsIterable(bytes, offset, length, replace);
+    case 'utf-32-be':
+      return decodeUtf32beAsIterable(bytes, offset, length, true, replace);
+    case 'utf-32-le':
+      return decodeUtf32leAsIterable(bytes, offset, length, true, replace);
+
+    default:
+      throw new IllegalArgumentException('Encoding $encoding not supported');
+  }
+}
+
+
+/**
+ * Collapses UTF-16 surrogate pairs found in [input] into single codepoints
+ * and returns the resulting string. When [input] contains no surrogate pair
+ * the very same string instance is returned.
+ *
+ * This is useful for fixing strings returned by [JSON.parse], if the JSON
+ * has UTF-16 encoded via surrogate pairs of characters. For example,
+ * `"\ud835\udd04"` should translate to a one character string with the code
+ * point `0x01d504`.
+ *
+ * A lead surrogate that is not followed by a trail surrogate is copied
+ * through unchanged.
+ */
+String decodeUtf16Surrogates(String input) {
+  // Stays null until the first pair is seen, so pair-free input allocates
+  // nothing.
+  List<int> merged = null;
+  int pos = 0;
+  while (pos < input.length) {
+    int unit = input.charCodeAt(pos);
+    bool isLead = unit >= 0xD800 && unit <= 0xDBFF;
+    if (isLead && pos + 1 < input.length) {
+      int trail = input.charCodeAt(pos + 1);
+      if (trail >= 0xDC00 && trail <= 0xDFFF) {
+        if (merged == null) {
+          // First pair: back-fill everything that was skipped so far.
+          merged = <int>[];
+          for (int k = 0; k < pos; k++) merged.add(input.charCodeAt(k));
+        }
+        unit = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00);
+        // Step over the trail surrogate that was just consumed.
+        pos++;
+      }
+    }
+    if (merged != null) merged.add(unit);
+    pos++;
+  }
+
+  return merged == null ? input : codepointsToString(merged);
+}
+
+
+/**
+ * Wraps [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) encoded
+ * [bytes] in an iterable of codepoints. Bytes are translated as the iterable
+ * is consumed, so the consumer converts only as much of the input as needed.
+ * Set [replacementCodepoint] to null to throw an [IllegalArgumentException]
+ * rather than replace the bad value.
+ */
+IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
+    [int offset = 0, int length,
+    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
+    new IterableWindows1252Decoder(bytes, offset, length, replacementCodepoint);
+
+
+/**
+ * The [Iterable] handed back by [decodeWindows1252AsIterable]. It only stores
+ * the byte list, range and replacement codepoint; every call to [iterator]
+ * creates a fresh [Windows1252Decoder] over them, and that decoder translates
+ * bytes one at a time as it is advanced. (Note: results are not cached.)
+ */
+class IterableWindows1252Decoder implements Iterable<int> {
+  final List<int> bytes;
+  final int offset;
+  final int length;
+  final int replacementCodepoint;
+
+  IterableWindows1252Decoder(this.bytes, [this.offset = 0, this.length,
+      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);
+
+  Windows1252Decoder iterator() {
+    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
+  }
+}
+
+
+/**
+ * Provides an iterator of Unicode codepoints from windows-1252 encoded bytes.
+ * Decoding starts at `offset` and covers `length` bytes (by default, all bytes
+ * from `offset` to the end of the list); a range that would extend past the
+ * end of the list is clamped to it.
+ *
+ * The five byte values that the mapping below leaves undefined (0x81, 0x8D,
+ * 0x8F, 0x90 and 0x9D) decode to [replacementCodepoint]. Set
+ * [replacementCodepoint] to null to throw an [IllegalArgumentException] rather
+ * than replace the bad value.
+ */
+class Windows1252Decoder implements Iterator<int> {
+  final int replacementCodepoint;
+  final List<int> _bytes;
+
+  /** Index of the next byte to decode. */
+  int _offset;
+
+  /** Index one past the last byte to decode. */
+  final int _end;
+
+  Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
+      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
+      : _bytes = bytes,
+        _offset = offset,
+        // length counts bytes from offset, so the end index is their sum,
+        // never allowed to exceed the list itself.
+        _end = (length == null || offset + length > bytes.length)
+            ? bytes.length
+            : offset + length;
+
+  /** Whether at least one more byte remains inside the requested range. */
+  bool hasNext() => _offset < _end;
+
+  /**
+   * Returns the codepoint for the next byte and advances past it. Throws a
+   * [NoMoreElementsException] once the range is exhausted.
+   */
+  int next() {
+    if (!hasNext()) throw const NoMoreElementsException();
+    return _mapChar(_bytes[_offset++]);
+  }
+
+  /**
+   * Maps a single windows-1252 byte to its Unicode codepoint. Values without
+   * an entry in the table below are returned unchanged.
+   */
+  int _mapChar(int char) {
+    // TODO(jmesserly): this is duplicating entitiesWindows1252 and
+    // replacementCharacters from constants.dart
+    switch (char) {
+      case 0x80: return 0x20AC; // EURO SIGN
+      case 0x82: return 0x201A; // SINGLE LOW-9 QUOTATION MARK
+      case 0x83: return 0x0192; // LATIN SMALL LETTER F WITH HOOK
+      case 0x84: return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
+      case 0x85: return 0x2026; // HORIZONTAL ELLIPSIS
+      case 0x86: return 0x2020; // DAGGER
+      case 0x87: return 0x2021; // DOUBLE DAGGER
+      case 0x88: return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
+      case 0x89: return 0x2030; // PER MILLE SIGN
+      case 0x8A: return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
+      case 0x8B: return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+      case 0x8C: return 0x0152; // LATIN CAPITAL LIGATURE OE
+      case 0x8E: return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
+      case 0x91: return 0x2018; // LEFT SINGLE QUOTATION MARK
+      case 0x92: return 0x2019; // RIGHT SINGLE QUOTATION MARK
+      case 0x93: return 0x201C; // LEFT DOUBLE QUOTATION MARK
+      case 0x94: return 0x201D; // RIGHT DOUBLE QUOTATION MARK
+      case 0x95: return 0x2022; // BULLET
+      case 0x96: return 0x2013; // EN DASH
+      case 0x97: return 0x2014; // EM DASH
+      case 0x98: return 0x02DC; // SMALL TILDE
+      case 0x99: return 0x2122; // TRADE MARK SIGN
+      case 0x9A: return 0x0161; // LATIN SMALL LETTER S WITH CARON
+      case 0x9B: return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+      case 0x9C: return 0x0153; // LATIN SMALL LIGATURE OE
+      case 0x9E: return 0x017E; // LATIN SMALL LETTER Z WITH CARON
+      case 0x9F: return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS
+
+      case 0x81:
+      case 0x8D:
+      case 0x8F:
+      case 0x90:
+      case 0x9D:
+        if (replacementCodepoint == null) {
+          // next() has already stepped past the offending byte, so its index
+          // is one behind the cursor.
+          throw new IllegalArgumentException(
+              "Invalid windows-1252 code point $char at ${_offset - 1}");
+        }
+        return replacementCodepoint;
+    }
+    return char;
+  }
+}