From 7f8fd76d4be9e03005e4a8653f920fd2b165fc3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ber=C3=A1nek?= Date: Mon, 14 Apr 2025 11:56:23 +0200 Subject: [PATCH 1/2] Add html5ever-0.13.0 benchmark --- .../html5ever-0.31.0/.cargo_vcs_info.json | 6 + .../html5ever-0.31.0/Cargo.lock | 831 ++++++++ .../html5ever-0.31.0/Cargo.toml | 88 + .../html5ever-0.31.0/Cargo.toml.orig | 30 + .../html5ever-0.31.0/LICENSE-APACHE | 201 ++ .../html5ever-0.31.0/LICENSE-MIT | 25 + .../html5ever-0.31.0/README.md | 61 + .../html5ever-0.31.0/benches/html5ever.rs | 80 + .../data/bench/lipsum-zh.html | 19 + .../html5ever-0.31.0/data/bench/lipsum.html | 40 + .../data/bench/medium-fragment.html | 24 + .../data/bench/small-fragment.html | 7 + .../html5ever-0.31.0/data/bench/strong.html | 1 + .../data/bench/tiny-fragment.html | 1 + .../html5ever-0.31.0/examples/arena.rs | 356 ++++ .../html5ever-0.31.0/examples/capi/tokenize.c | 74 + .../examples/noop-tokenize.rs | 47 + .../examples/noop-tree-builder.rs | 132 ++ .../examples/print-tree-actions.rs | 185 ++ .../html5ever-0.31.0/examples/tokenize.rs | 111 + .../html5ever-0.31.0/perf-config.json | 4 + .../html5ever-0.31.0/src/driver.rs | 137 ++ .../html5ever-0.31.0/src/lib.rs | 31 + .../html5ever-0.31.0/src/macros.rs | 36 + .../html5ever-0.31.0/src/serialize/mod.rs | 255 +++ .../src/tokenizer/char_ref/mod.rs | 445 ++++ .../src/tokenizer/interface.rs | 99 + .../html5ever-0.31.0/src/tokenizer/mod.rs | 1761 ++++++++++++++++ .../html5ever-0.31.0/src/tokenizer/states.rs | 97 + .../html5ever-0.31.0/src/tree_builder/data.rs | 170 ++ .../html5ever-0.31.0/src/tree_builder/mod.rs | 1789 +++++++++++++++++ .../src/tree_builder/rules.rs | 1495 ++++++++++++++ .../src/tree_builder/tag_sets.rs | 114 ++ .../src/tree_builder/types.rs | 98 + .../html5ever-0.31.0/src/util/str.rs | 42 + 35 files changed, 8892 insertions(+) create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/.cargo_vcs_info.json create mode 100644 
collector/compile-benchmarks/html5ever-0.31.0/Cargo.lock create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml.orig create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/LICENSE-APACHE create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/LICENSE-MIT create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/README.md create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/benches/html5ever.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/data/bench/lipsum-zh.html create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/data/bench/lipsum.html create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/data/bench/medium-fragment.html create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/data/bench/small-fragment.html create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/data/bench/strong.html create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/data/bench/tiny-fragment.html create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/examples/arena.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/examples/capi/tokenize.c create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/examples/noop-tokenize.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/examples/noop-tree-builder.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/examples/print-tree-actions.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/examples/tokenize.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/perf-config.json create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/driver.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/lib.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/macros.rs create mode 100644 
collector/compile-benchmarks/html5ever-0.31.0/src/serialize/mod.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/char_ref/mod.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/interface.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/mod.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/states.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/data.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/mod.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/rules.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/tag_sets.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/types.rs create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/src/util/str.rs diff --git a/collector/compile-benchmarks/html5ever-0.31.0/.cargo_vcs_info.json b/collector/compile-benchmarks/html5ever-0.31.0/.cargo_vcs_info.json new file mode 100644 index 000000000..9dad99bef --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "ce481196ff2e60eb536d9c022f4ca00bd5181f15" + }, + "path_in_vcs": "html5ever" +} \ No newline at end of file diff --git a/collector/compile-benchmarks/html5ever-0.31.0/Cargo.lock b/collector/compile-benchmarks/html5ever-0.31.0/Cargo.lock new file mode 100644 index 000000000..f14c97347 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/Cargo.lock @@ -0,0 +1,831 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" + +[[package]] +name = "bumpalo" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e77c3243bd94243c03672cb5154667347c457ca271254724f9f393aee1c05ff" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" 
+dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + +[[package]] +name = "html5ever" +version = "0.31.0" +dependencies = [ + "criterion", + "log", + "mac", + "markup5ever", + "match_token", + "typed-arena", +] + +[[package]] +name = "is-terminal" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e19b23d53f35ce9f56aebc7d1bb4e6ac1e9c0db7ac85c8d1760c04379edced37" +dependencies = [ + "hermit-abi", + 
"libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ba2225413ed418d540a2c8247d794f4b0527a021da36f69c05344d716dc44c1" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "match_token" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "oorandom" +version = "11.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" 
+version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] +name = "proc-macro2" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.38" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redox_syscall" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustversion" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" + +[[package]] +name = "ryu" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.217" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.217" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.138" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + +[[package]] +name = "smallvec" +version = "1.13.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "string_cache" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "938d512196766101d333398efde81bc1f37b00cb42c2f8350e5df639f040bbbe" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + +[[package]] +name = "syn" +version = "2.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "typed-arena" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" + +[[package]] +name = "unicode-ident" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" + +[[package]] +name = "utf-8" +version = "0.7.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml b/collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml new file mode 100644 index 000000000..016e26afe --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml @@ -0,0 +1,88 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. 
+ +[package] +edition = "2021" +rust-version = "1.70.0" +name = "html5ever" +version = "0.31.0" +authors = ["The html5ever Project Developers"] +build = false +autolib = false +autobins = false +autoexamples = false +autotests = false +autobenches = false +description = "High-performance browser-grade HTML5 parser" +documentation = "https://docs.rs/html5ever" +readme = "README.md" +keywords = [ + "html", + "html5", + "parser", + "parsing", +] +categories = [ + "parser-implementations", + "web-programming", +] +license = "MIT OR Apache-2.0" +repository = "https://github.com/servo/html5ever" + +[lib] +name = "html5ever" +path = "src/lib.rs" + +[[example]] +name = "arena" +path = "examples/arena.rs" + +[[example]] +name = "noop-tokenize" +path = "examples/noop-tokenize.rs" + +[[example]] +name = "noop-tree-builder" +path = "examples/noop-tree-builder.rs" + +[[example]] +name = "print-tree-actions" +path = "examples/print-tree-actions.rs" + +[[example]] +name = "tokenize" +path = "examples/tokenize.rs" + +[[bench]] +name = "html5ever" +path = "benches/html5ever.rs" +harness = false + +[dependencies.log] +version = "0.4" + +[dependencies.mac] +version = "0.1" + +[dependencies.markup5ever] +version = "0.16" + +[dependencies.match_token] +version = "0.1" + +[dev-dependencies.criterion] +version = "0.5" + +[dev-dependencies.typed-arena] +version = "2.0.2" + +[features] +trace_tokenizer = [] diff --git a/collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml.orig b/collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml.orig new file mode 100644 index 000000000..f6f3576af --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml.orig @@ -0,0 +1,30 @@ +[package] +name = "html5ever" +version = "0.31.0" +authors = [ "The html5ever Project Developers" ] +license = "MIT OR Apache-2.0" +repository = "https://github.com/servo/html5ever" +description = "High-performance browser-grade HTML5 parser" +documentation = "https://docs.rs/html5ever" +categories = [ 
"parser-implementations", "web-programming" ] +keywords = ["html", "html5", "parser", "parsing"] +edition = "2021" +readme = "../README.md" +rust-version.workspace = true + +[features] +trace_tokenizer = [] + +[dependencies] +log = "0.4" +mac = "0.1" +markup5ever = { version = "0.16", path = "../markup5ever" } +match_token = { workspace = true } + +[dev-dependencies] +criterion = "0.5" +typed-arena = "2.0.2" + +[[bench]] +name = "html5ever" +harness = false diff --git a/collector/compile-benchmarks/html5ever-0.31.0/LICENSE-APACHE b/collector/compile-benchmarks/html5ever-0.31.0/LICENSE-APACHE new file mode 100644 index 000000000..16fe87b06 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/collector/compile-benchmarks/html5ever-0.31.0/LICENSE-MIT b/collector/compile-benchmarks/html5ever-0.31.0/LICENSE-MIT new file mode 100644 index 000000000..6e4510217 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2014 The html5ever Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/collector/compile-benchmarks/html5ever-0.31.0/README.md b/collector/compile-benchmarks/html5ever-0.31.0/README.md new file mode 100644 index 000000000..746dff207 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/README.md @@ -0,0 +1,61 @@ +# html5ever + +[![Build Status](https://github.com/servo/html5ever/actions/workflows/main.yml/badge.svg)](https://github.com/servo/html5ever/actions) +[![crates.io](https://img.shields.io/crates/v/html5ever.svg)](https://crates.io/crates/html5ever) + +[API Documentation][API documentation] + +html5ever is an HTML parser developed as part of the [Servo][] project. + +It can parse and serialize HTML according to the [WHATWG](https://whatwg.org/) specs (aka "HTML5"). However, there are some differences in the actual behavior currently, most of which are documented [in the bug tracker][]. html5ever passes all tokenizer tests from [html5lib-tests][], with most tree builder tests outside of the unimplemented features. The goal is to pass all html5lib tests, while also providing all hooks needed by a production web browser, e.g. `document.write`. + +Note that the HTML syntax is very similar to XML. For correct parsing of XHTML, use an XML parser (that said, many XHTML documents in the wild are serialized in an HTML-compatible form). + +html5ever is written in [Rust][], therefore it avoids the notorious security problems that come along with using C. Being built with Rust also makes the library come with the high-grade performance you would expect from an HTML parser written in C. html5ever is basically a C HTML parser, but without needing a garbage collector or other heavy runtime processes. + + +## Getting started in Rust + +Add html5ever as a dependency in your [`Cargo.toml`](https://crates.io/) file: + +```toml +[dependencies] +html5ever = "0.30" +``` + +You should also take a look at [`examples/html2html.rs`], [`examples/print-rcdom.rs`], and the [API documentation][]. 
+ + +## Getting started in other languages + +Bindings for Python and other languages are much desired. + + +## Working on html5ever + +To fetch the test suite, you need to run + +``` +git submodule update --init +``` + +Run `cargo doc` in the repository root to build local documentation under `target/doc/`. + + +## Details + +html5ever uses callbacks to manipulate the DOM, therefore it does not provide any DOM tree representation. + +html5ever exclusively uses UTF-8 to represent strings. In the future it will support other document encodings (and UCS-2 `document.write`) by converting input. + +The code is cross-referenced with the WHATWG syntax spec, and eventually we will have a way to present code and spec side-by-side. + +html5ever builds against the official stable releases of Rust, though some optimizations are only supported on nightly releases. + +[API documentation]: https://docs.rs/html5ever +[Servo]: https://github.com/servo/servo +[Rust]: https://www.rust-lang.org/ +[in the bug tracker]: https://github.com/servo/html5ever/issues?q=is%3Aopen+is%3Aissue+label%3Aweb-compat +[html5lib-tests]: https://github.com/html5lib/html5lib-tests +[`examples/html2html.rs`]: https://github.com/servo/html5ever/blob/main/rcdom/examples/html2html.rs +[`examples/print-rcdom.rs`]: https://github.com/servo/html5ever/blob/main/rcdom/examples/print-rcdom.rs diff --git a/collector/compile-benchmarks/html5ever-0.31.0/benches/html5ever.rs b/collector/compile-benchmarks/html5ever-0.31.0/benches/html5ever.rs new file mode 100644 index 000000000..0df9a69c4 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/benches/html5ever.rs @@ -0,0 +1,80 @@ +#[macro_use] +extern crate criterion; +extern crate html5ever; + +use std::fs; +use std::path::PathBuf; + +use criterion::{black_box, Criterion}; + +use html5ever::tendril::*; +use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer}; + +struct Sink; + +impl TokenSink for Sink { + type Handle = (); + 
+ fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<()> { + // Don't use the token, but make sure we don't get + // optimized out entirely. + black_box(token); + TokenSinkResult::Continue + } +} + +fn run_bench(c: &mut Criterion, name: &str) { + let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + path.push("data/bench/"); + path.push(name); + let mut file = fs::File::open(&path).expect("can't open file"); + + // Read the file and treat it as an infinitely repeating sequence of characters. + let mut file_input = ByteTendril::new(); + file.read_to_tendril(&mut file_input) + .expect("can't read file"); + let file_input: StrTendril = file_input.try_reinterpret().unwrap(); + let size = file_input.len(); + let mut stream = file_input.chars().cycle(); + + // Break the input into chunks of 1024 chars (= a few kB). + // This simulates reading from the network. + let mut input = vec![]; + let mut total = 0usize; + while total < size { + // The by_ref() call is important, otherwise we get wrong results! + // See rust-lang/rust#18045. + let sz = std::cmp::min(1024, size - total); + input.push(stream.by_ref().take(sz).collect::().to_tendril()); + total += sz; + } + + let test_name = format!("html tokenizing {name}"); + + c.bench_function(&test_name, move |b| { + b.iter(|| { + let tok = Tokenizer::new(Sink, Default::default()); + let buffer = BufferQueue::default(); + // We are doing clone inside the bench function, this is not ideal, but possibly + // necessary since our iterator consumes the underlying buffer. 
+ for buf in input.clone().into_iter() { + buffer.push_back(buf); + let _ = tok.feed(&buffer); + } + let _ = tok.feed(&buffer); + tok.end(); + }) + }); +} + +fn html5ever_benchmark(c: &mut Criterion) { + run_bench(c, "lipsum.html"); + run_bench(c, "lipsum-zh.html"); + run_bench(c, "medium-fragment.html"); + run_bench(c, "small-fragment.html"); + run_bench(c, "tiny-fragment.html"); + run_bench(c, "strong.html"); +} + +criterion_group!(benches, html5ever_benchmark); +criterion_main!(benches); diff --git a/collector/compile-benchmarks/html5ever-0.31.0/data/bench/lipsum-zh.html b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/lipsum-zh.html new file mode 100644 index 000000000..1efe2fa52 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/lipsum-zh.html @@ -0,0 +1,19 @@ +甀 曒檃檑 糲蘥蠩 櫋瀩, 嗢 剆坲姏 齸圞趲 葠蜄蛖 砎粁 擙樲橚 噅尰崺 廘榙榾 誙 煘煓, 腶 敔耜 逯郹酟 蝪蝩覤 顲鱭鸋, 趍 櫱瀯灂 碄碆碃 矠筸 砫粍 耜僇鄗 搋朠楟 溔 齝囃 槏 鼏噳墺 滭滹漇, 骱 翀胲胵 蝑蝞蝢 鑅鷖 + +痯 荾莯 驧鬤鸕 梪涫湴, 踙 黈龠懱 椼毸溠 蠬襱覾 滱漮, 耜僇鄗 沀皯竻 饇馦 蒏 斠 墐墆墏 艎艑蔉 貕貔 廑憀慡 嫬廙彯 鳻嶬 跿, 飹勫嫢 熤熡磎 慛 賗跿, 灂瀿 綧 摿斠榱 橀槶澉 碄碆碃 鯦鯢鯡 踾踶輵 鍌鍗鍷 溿 滭滹, 綧 藙藨 蝪蝩覤 渮湸湤, 輗 鰝鰨 犌犐瑆 櫞氌瀙 鵳齖齘 塝 寁崏 摨敹暯 檌檒濦 滭滹漇, 撖 輈鄍 婸媥媕 漦澌潬, 膣 姛帡恦 莃荶衒 昢炾 + +儮嬼懫 馦騧騜 覛谼貆 墏壾 鋱, 緦 豥趍 翍脝艴 絟缾臮 摲 輴郺 篧糑縒 獧瞝瞣 袀豇貣, 廞 鶄鵳 肒芅邥 泏狔狑 覛谼貆 儋圚墝 滭滹漇 鰝鰨 蔰, 忁曨曣 蝪蝩覤 埱娵徖 萴葂 跬, 緷 巂鞪 晛桼桾 踥踕踛 翣聜蒢 虥諰諨 箄縴儳 磼簎 殠, 銇 烺焆琀 鱐鱍鱕 垽娭屔 齫儽, 蒮 靮傿 烍烚珜 蒝蒧蓏 璈皞緪 圪妀 綧 溮煡煟 轛轝酅 濷瓂癚, 篧糑縒 谾踘遳 讘麡 腶, 鯦鯢鯡 邆錉霋 鋱 蛚袲 鋱鋟鋈 瀷瀹藶 騉鬵 嗢 + +蝺 鰔鶟 濇燖燏 梪涫湴 齫儽戃, 馺 髬魆 齴讘麡 袟袘觕, 甀瞂硾 鍹餳駷 邆錉霋 曮禷 瑽 虰豖 瀿犨皫 蜬蝁蜠 檹瀔濼 榯, 獝瘝磈 輣鋄銶 抏旲 諃 褌 緳廞徲 轞騹鼚 瘵瘲 媥媕 踙 簎艜薤 鸙讟钃 + +滘 鐩闤鞿 轞騹鼚 絟缾臮 碃稘, 鮥鴮 輗 渳湥牋 獿譿躐 趉軨鄇 鋑鋡髬 嶜憃撊 磑 棳棔 滜溙 蔏 烺焆琀 鱐鱍鱕 撌斳暩 緅 彃慔 釢髟偛 礯籔羻 + +鏾鐇闠 擙樲橚 塓塕 慔 笢笣 壾 婸媥媕 奫嫮嫳, 愄揎揇 趡趛踠 瑽 秎穾, 腤萰 蛃袚觙 玝甿虮 濆澓澋 魦 綧 瘱瘵瘲 擙樲橚 瞵瞷矰 璈皞, 腠腶舝 翣聜蒢 魵 潧潣, 慖摲摓 橍殧澞 蟷蠉蟼 摮 嗢嗂 誙賗跿 磏磑禠 蝩覤 穊 鷕黰戄 鼀齕櫌 殔湝 緦, 緁 瘱瘵瘲 鸃鼞欘 窞綆腤 嗼嗹 輷邆 壿 櫱瀯灂 鶭黮齥 鏙闛颾, 眊砎粁 硻禂稢 薢蟌 鋈, 榎榯槄 墂嫫嵾 毄滱漮 豥 髟偛 + +掭掝 暲 瞵瞷矰 鬄鵊鵙 鍎鞚韕, 齞齝囃 脬舑莕 蔍 嫳嫬 絼綒 縸縩薋 毊灚襳 珝砯砨 嵧 裌覅詵 崸嵀惉 慛 碞碠 蒮 橁橖澭 摨敹暯 罫蓱蒆 嵥嵧 蟷蠉 滆 櫱瀯灂 鶟儹巏 瘑睯碫 + +滈 簎艜薤 廑憀慡 鑴鱱爧 屼汆, 歅 彔抳 鏾鐇闠 桏毢涒 垽娭屔 磝磢磭 袟袘觕 鍌鍗鍷 鋈 氠洷, 棳棔 雈靮傿 臡虈觿 氃濈瀄 槄 橀槶澉 麷劻穋 嘽 簅縭, 狑玝 垥娀庣 僤凘墈 岯岪弨 摲, 馺骱魡 抩枎殀 迗俀侹 蓪 錛鍆 蔰 暯樧 璸瓁穟 瘑睯碫 濍燂犝, 犵艿邔 獧瞝瞣 
馻噈嫶 蝢褗 僣, 嬨嶵 壿 蠝襭譸 痑祣筇 觛詏貁 蜙 珶珸珿 濷瓂癚 箑箖 嗼嗹墋 峷敊浭 阰刲 鄜, 柦柋牬 寁崏庲 礯籔羻 鋍鞎 鉾 跠跬 蜸 勯噚嶢 礌簨繖 醳鏻鐆 + +蟷蠉蟼 熩熝犚 摓 髽鮛 顤鰩鷎 駍駔鳿 鸃鼞欘 褅 牬玾 殍涾烰 誽賚賧 鴸鼢曘 搋朠 殟 蟼襛 溔 嶵嶯幯 蒘蝆蜪, 蟣襋 溿煔煃 銇韎餀 蹸蹪鏂 摮 踸躽 踣 廦廥彋 鼀齕櫌, 靾鞂 虥諰諨 婸媥媕 毄滱漮 魆 蒛 裧頖 鍆錌雔 枅杺枙 堔埧娾, 蓂蓌蓖 噾噿嚁 洷炟砏 砎粁 鋱, 嬼懫 杍肜阰 麷劻穋 蔊蓴蔖 豥 + +暕 忀瀸蘌 褣諝趥 髽鮛 滍 噾噿 顤鰩鷎 逯郹酟 樏殣氀 煻獌 蚔趵郚 枲柊氠 鄃鈌鈅 暕, 禖穊稯 鄨鎷闒 鏾鐇闠 蒝蒧 誙 賌輈鄍 鶊鵱鶆 毊灚襳 珋疧 滘 瀗犡礝 簻臗藱 駔鳿 磑, 墐 圩芰敔 婂崥崣 溹溦滜 鍗鍷 diff --git a/collector/compile-benchmarks/html5ever-0.31.0/data/bench/lipsum.html b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/lipsum.html new file mode 100644 index 000000000..27dc14aff --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/lipsum.html @@ -0,0 +1,40 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer eu arcu varius, fringilla nisi non, pulvinar lorem. Nam et sollicitudin nisi, eget tempus sapien. Suspendisse ac libero velit. Proin semper lacinia posuere. Morbi sollicitudin lacinia urna, eget aliquet risus lobortis sit amet. Fusce rhoncus sodales mauris, a rutrum erat elementum id. Integer nec sapien sit amet nisl convallis vehicula eu eu augue. Etiam nec elit ac nibh lacinia porta. Integer dapibus feugiat magna, eget varius ante vestibulum vel. Vestibulum vitae felis quis est tristique varius quis eget libero. Nullam tincidunt magna eros, nec luctus ante pretium at. Aenean laoreet justo vitae risus fringilla convallis. In malesuada scelerisque lorem, sed luctus tortor varius at. Morbi odio ligula, commodo eu sodales vitae, bibendum eget leo. In odio est, laoreet sit amet eleifend at, placerat in elit. + +Nullam ac viverra elit. Vestibulum et massa vel justo bibendum imperdiet. Donec elementum vitae nibh sit amet pellentesque. Ut id fringilla sem, in tincidunt quam. In a dui dignissim, gravida magna in, porta ante. Integer adipiscing porta risus. Nulla facilisi. Cras erat leo, tempor a ligula in, posuere ullamcorper nulla. Maecenas id auctor elit, imperdiet sagittis augue. Curabitur consectetur suscipit lorem porta sollicitudin. 
Etiam turpis orci, eleifend eu felis in, placerat consequat est. Sed ultrices, tellus ut volutpat venenatis, metus lectus malesuada diam, id ornare risus lectus sed massa. Vivamus mauris diam, lobortis ut interdum eget, porta a elit. Suspendisse potenti. + +Donec tincidunt nisi sed mollis feugiat. Mauris ultricies risus non eros feugiat tempor. In aliquam ut nunc id tempor. Curabitur vel elit dolor. Mauris ullamcorper tortor ac nisl feugiat, quis gravida nisl ullamcorper. Pellentesque a ligula quis erat rutrum sollicitudin in a metus. Aliquam ligula massa, cursus in libero a, blandit feugiat tortor. In ac auctor lorem. Ut faucibus leo nec egestas tristique. + +Nulla adipiscing consectetur odio, a iaculis eros aliquam at. Nullam dapibus ac ante et convallis. Phasellus tempor arcu velit. Donec adipiscing neque eu molestie mattis. Vestibulum id elit fringilla, ultrices orci eu, rhoncus purus. Mauris ornare nisi massa, et luctus tortor tincidunt vel. Maecenas eu ultrices enim, et varius est. Integer ipsum nunc, suscipit eu dapibus ac, ornare vitae sapien. Vestibulum posuere, nulla sed dictum tempus, magna metus commodo turpis, a aliquet orci tellus eu lectus. Mauris nulla magna, malesuada vitae iaculis ut, facilisis varius sem. In tristique sapien urna, et tristique dolor lacinia non. Suspendisse eu tincidunt eros. Pellentesque dignissim elit vitae purus auctor, non malesuada dolor scelerisque. + +Cras commodo tortor at risus ornare euismod a et risus. Sed rutrum, justo vel mollis condimentum, mi elit consectetur mi, non ultricies quam orci mollis sapien. Donec tincidunt, lacus molestie porttitor elementum, enim ligula hendrerit lacus, quis porttitor magna velit sed nisi. Quisque pretium eros id sem posuere consequat id sit amet nunc. Fusce pulvinar commodo ipsum, quis congue tellus faucibus eu. Sed bibendum dolor vitae ante porttitor pretium. Integer id malesuada eros, sed tristique metus. Nunc vitae turpis eu risus sodales vestibulum quis ut magna. 
In eget metus elit. Donec gravida libero risus, eget tempus erat varius eu. Vestibulum id dignissim sapien. Fusce pretium posuere lacus. Aliquam ac arcu sollicitudin, lacinia tellus vitae, pellentesque tortor. Mauris viverra velit ac lacus egestas sagittis. Duis auctor interdum tincidunt. Aenean eu ullamcorper sapien, sit amet sollicitudin magna. + +Nam vel lorem a quam sollicitudin fringilla sit amet quis nibh. Quisque commodo molestie augue. Vivamus ut erat aliquet, gravida ante at, suscipit arcu. Fusce nulla massa, lobortis vel dictum non, vehicula ac lorem. Etiam blandit sodales urna, at aliquet libero dapibus a. Cras odio mauris, porta at enim vitae, aliquam tincidunt libero. Praesent at tortor eu eros cursus consequat vel non elit. Mauris risus urna, sagittis eget turpis eu, malesuada semper nisl. Nunc posuere placerat ligula, in tristique urna pharetra et. Duis consectetur mauris nulla. Etiam auctor tincidunt molestie. Fusce eu faucibus diam, nec fermentum felis. Curabitur non lacinia quam, non luctus neque. Morbi sed ultrices diam. + +Fusce accumsan nisl sed nibh fringilla euismod. In ut arcu cursus erat imperdiet porttitor. Pellentesque tempus, nisi quis viverra convallis, eros sem dapibus magna, ut aliquet quam urna vitae dolor. Aenean id tortor turpis. Etiam lacinia arcu lorem, in consectetur arcu placerat sed. Duis non est ornare, dictum mi sit amet, cursus nunc. Suspendisse at venenatis massa. Etiam eget lorem diam. Donec tristique sapien at scelerisque porta. Aenean ornare ligula sed nibh gravida, vel commodo erat ultrices. Donec id enim purus. Vivamus malesuada tristique sapien id tempus. Morbi nec nunc dolor. + +Aliquam molestie turpis cursus blandit blandit. Integer imperdiet ullamcorper arcu, a fermentum nisi. Cras hendrerit quam id mollis elementum. Etiam ut erat ac leo posuere aliquet eget non tortor. Nam vel velit sed dui tincidunt gravida eget eget risus. Suspendisse adipiscing sed nulla vel molestie. 
Aliquam suscipit, sem sed volutpat sagittis, magna enim feugiat erat, pharetra feugiat magna neque a ante. Duis at metus eget leo congue molestie. Vivamus id massa ornare, rutrum ante nec, ullamcorper lacus. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Vestibulum lobortis arcu eu arcu hendrerit convallis. Integer mollis velit at ante consequat, eu pharetra erat venenatis. Integer tincidunt sit amet massa vel hendrerit. Morbi malesuada facilisis augue sed congue. Phasellus porttitor vel mi eu imperdiet. Aenean tincidunt, massa et tristique mollis, nisl metus vulputate est, quis sollicitudin metus ipsum vel felis. + +Suspendisse nec feugiat dui. Proin nec lorem semper, dignissim leo et, suscipit turpis. In posuere sem ut blandit scelerisque. Fusce vel ultricies augue, adipiscing pretium lacus. Mauris ac dui non odio convallis pellentesque. Curabitur posuere nec odio ut sodales. Morbi varius risus lacinia, convallis mauris in, tristique turpis. + +Vivamus hendrerit justo augue, et molestie ligula aliquam ac. Nunc nec vehicula felis. Donec quam lacus, commodo sollicitudin aliquet eu, aliquam ut leo. Donec vulputate arcu urna, in molestie orci faucibus non. Praesent ut ullamcorper ante. Quisque sollicitudin libero in arcu gravida, quis scelerisque tortor volutpat. Nulla ornare mi ac odio sagittis rutrum. Sed quis sagittis felis. Praesent bibendum orci sed risus elementum, malesuada posuere massa condimentum. Sed velit nunc, pulvinar eu feugiat at, ultrices eu odio. Mauris lacinia ut odio eget ornare. Nullam commodo mollis lorem, ac vehicula justo tristique a. + +Morbi est ipsum, egestas a urna sed, aliquet tempus ipsum. In eget fermentum libero. Nullam a sodales dui. Nam imperdiet condimentum luctus. Morbi bibendum at nulla sed aliquam. Quisque nibh nibh, sollicitudin non ullamcorper commodo, viverra non metus. Suspendisse eleifend turpis massa. Cras tortor metus, rutrum sit amet tellus a, sodales suscipit eros. 
Sed in vulputate ligula. Integer posuere velit sed nisl tristique suscipit. Quisque bibendum eleifend enim in sollicitudin. Phasellus tincidunt orci pretium, molestie felis eu, sodales metus. + +Vestibulum consectetur orci ut blandit aliquet. Sed posuere cursus lacus vestibulum posuere. Phasellus ut risus sem. Vivamus et purus non felis pellentesque lacinia. Phasellus aliquam, diam eget vestibulum lobortis, purus tortor porttitor eros, vitae auctor lorem velit a turpis. Integer eu metus vel nisi porta lobortis sollicitudin eget arcu. Maecenas ac blandit dolor. In et sapien ornare, dignissim nulla quis, tempor odio. + +Ut nec quam ligula. Ut euismod, nisi nec iaculis faucibus, nisi arcu dignissim neque, a fringilla dolor tellus ut arcu. Curabitur iaculis rhoncus orci sed fermentum. Cras augue elit, eleifend sodales pellentesque ac, varius bibendum nulla. Etiam id diam non purus porta lobortis. Cras fringilla metus in ipsum laoreet placerat. Integer vel quam nec libero varius mattis in non nibh. + +Pellentesque adipiscing feugiat neque, vitae imperdiet dui. Duis pharetra elit a dictum laoreet. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nulla vulputate malesuada nisi, vel egestas nulla mollis ut. Nunc faucibus pharetra leo ac ultricies. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus in odio a magna convallis molestie ut at mauris. Morbi bibendum id dui id imperdiet. Curabitur volutpat et erat quis venenatis. Integer tincidunt et felis sed rutrum. Donec vitae porttitor enim. Sed nisi nunc, auctor ac ullamcorper quis, eleifend id metus. + +Morbi felis est, tincidunt at eros at, interdum tempor tortor. Nam et semper metus. Vivamus lacinia pulvinar magna, a lacinia ligula condimentum vitae. Donec vitae ullamcorper diam. Aenean auctor mollis tincidunt. Mauris hendrerit eros quis nulla posuere, non mattis tellus venenatis. Fusce et ligula nec arcu consequat pulvinar. 
Maecenas sagittis odio justo, at ultrices velit aliquet quis. In hac habitasse platea dictumst. Suspendisse viverra nunc vitae lectus bibendum, vel pretium arcu pretium. Curabitur iaculis condimentum magna ac rutrum. Aenean placerat massa nunc, id vehicula magna vulputate eget. Integer dignissim nunc in enim bibendum consequat vitae id leo. Mauris quis aliquam quam. Suspendisse vel fringilla purus. Mauris sodales dui vitae lacus pellentesque tincidunt a eget nunc. + +Nullam imperdiet vestibulum magna nec dictum. Vestibulum scelerisque vestibulum congue. Phasellus fermentum pulvinar elit, eget fringilla arcu vestibulum sed. Mauris pretium nulla in consectetur cursus. Cras malesuada est vulputate hendrerit bibendum. Aenean a tristique diam, ac convallis ipsum. Nunc ac justo ut ante tristique pulvinar. Donec ornare leo sed iaculis rutrum. Integer tincidunt vestibulum massa scelerisque accumsan. Maecenas malesuada, orci at tincidunt faucibus, ipsum velit condimentum odio, vitae cursus risus justo vel orci. Interdum et malesuada fames ac ante ipsum primis in faucibus. Vivamus eu tincidunt leo. Nam a faucibus ipsum, in convallis ligula. Fusce urna lorem, iaculis ut pharetra a, laoreet a mauris. Maecenas molestie justo enim, vitae tincidunt nulla dictum quis. + +Ut ac purus ut velit feugiat tincidunt nec sit amet lorem. Mauris nulla sapien, rhoncus a condimentum et, tincidunt ut enim. Nullam eu rhoncus ante. Proin eget erat est. Vivamus suscipit fringilla metus, ut scelerisque urna. Vivamus id porta nibh, ac tincidunt nisl. Vivamus commodo tincidunt turpis a molestie. Phasellus nec interdum enim. Cras accumsan tristique massa. + +Cras vitae blandit dolor. Sed purus sem, pharetra sed orci eu, fermentum porttitor magna. Morbi dictum gravida sodales. Pellentesque varius non quam in ullamcorper. Sed in mauris sit amet sapien tempus gravida. Aliquam suscipit nulla a risus ullamcorper, et pharetra leo pharetra. 
Pellentesque neque lectus, molestie et eros id, consequat sagittis arcu. Nullam suscipit ipsum id lacus tincidunt sollicitudin. Fusce eget leo non massa tempor scelerisque ut a enim. Vestibulum a elementum ligula. Aliquam vehicula semper nibh nec imperdiet. Interdum et malesuada fames ac ante ipsum primis in faucibus. Etiam pretium ante eget lectus rutrum auctor. + +Sed pharetra quam metus. Aenean ac rutrum arcu. Donec sit amet pharetra nulla, vitae porttitor eros. Nullam accumsan cursus dolor, ut sodales magna tincidunt quis. Quisque egestas pellentesque velit id fringilla. Duis vel nisi libero. Vivamus ultrices ligula vel tempor lacinia. Cras dictum ut nunc vel suscipit. Duis convallis tortor varius consectetur tempor. Maecenas sed pharetra quam. Nunc malesuada risus justo, et vehicula quam placerat at. Vestibulum non orci eu felis viverra convallis. + +Nulla accumsan ultrices ligula, id commodo odio interdum sed. Fusce sit amet varius tortor. Integer non mattis eros. Curabitur vulputate massa non ante lacinia sodales. Aenean a feugiat ligula. Fusce ultricies molestie lectus auctor dignissim. Duis eu lorem feugiat, varius quam vel, volutpat magna. Pellentesque nec nisl ut lorem interdum condimentum scelerisque eu purus. Vestibulum porttitor elementum lectus quis lobortis. Vestibulum non sem ultricies, elementum risus non, aliquet ipsum. Phasellus pellentesque lacinia purus et tristique. Aenean lacinia, mi vel rutrum dapibus, nibh lacus hendrerit velit, ac faucibus massa erat sodales dui. Etiam in enim varius, auctor risus vel, blandit quam. + diff --git a/collector/compile-benchmarks/html5ever-0.31.0/data/bench/medium-fragment.html b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/medium-fragment.html new file mode 100644 index 000000000..570bef2ff --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/medium-fragment.html @@ -0,0 +1,24 @@ +

History[edit]

+

By early 1992 the search was on for a good byte-stream encoding of multi-byte character sets. The draft ISO 10646 standard contained a non-required annex called UTF-1 + that provided a byte-stream encoding of its 32-bit code points. This +encoding was not satisfactory on performance grounds, but did introduce +the notion that bytes in the range of 0–127 continue representing the +ASCII characters in UTF, thereby providing backward compatibility with +ASCII.

+

In July 1992, the X/Open committee XoJIG was looking for a better encoding. Dave Prosser of Unix System Laboratories + submitted a proposal for one that had faster implementation +characteristics and introduced the improvement that 7-bit ASCII +characters would only represent themselves; all multibyte +sequences would include only bytes where the high bit was set. This +original proposal, FSS-UTF (File System Safe UCS Transformation Format), + was similar in concept to UTF-8, but lacked the crucial property of self-synchronization.[7][8]

+

In August 1992, this proposal was circulated by an IBM X/Open representative to interested parties. Ken Thompson of the Plan 9 operating system group at Bell Labs + then made a small but crucial modification to the encoding, making it +very slightly less bit-efficient than the previous proposal but allowing + it to be self-synchronizing, + meaning that it was no longer necessary to read from the beginning of +the string to find code point boundaries. Thompson's design was outlined + on September 2, 1992, on a placemat in a New Jersey diner with Rob Pike. In the following days, Pike and Thompson implemented it and updated Plan 9 to use it throughout, and then communicated their success back to X/Open.[7]

+

UTF-8 was first officially presented at the USENIX conference in San Diego, from January 25 to 29, 1993.

+

Google reported that in 2008 UTF-8 (misleadingly labelled "Unicode") became the most common encoding for HTML files.[9][10]

+

Description[edit]

diff --git a/collector/compile-benchmarks/html5ever-0.31.0/data/bench/small-fragment.html b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/small-fragment.html new file mode 100644 index 000000000..a0b9643e5 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/small-fragment.html @@ -0,0 +1,7 @@ +

In July 1992, the X/Open committee XoJIG was looking for a better encoding. Dave Prosser of Unix System Laboratories + submitted a proposal for one that had faster implementation +characteristics and introduced the improvement that 7-bit ASCII +characters would only represent themselves; all multibyte +sequences would include only bytes where the high bit was set. This +original proposal, FSS-UTF (File System Safe UCS Transformation Format), + was similar in concept to UTF-8, but lacked the crucial property of self-synchronization. diff --git a/collector/compile-benchmarks/html5ever-0.31.0/data/bench/strong.html b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/strong.html new file mode 100644 index 000000000..0ef665e5d --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/strong.html @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/collector/compile-benchmarks/html5ever-0.31.0/data/bench/tiny-fragment.html b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/tiny-fragment.html new file mode 100644 index 000000000..7ce535433 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/data/bench/tiny-fragment.html @@ -0,0 +1 @@ +

Hello, world!

diff --git a/collector/compile-benchmarks/html5ever-0.31.0/examples/arena.rs b/collector/compile-benchmarks/html5ever-0.31.0/examples/arena.rs new file mode 100644 index 000000000..36854caf1 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/examples/arena.rs @@ -0,0 +1,356 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +extern crate html5ever; +extern crate typed_arena; + +use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; +use html5ever::tendril::{StrTendril, TendrilSink}; +use html5ever::{parse_document, Attribute, QualName}; +use std::borrow::Cow; +use std::cell::{Cell, RefCell}; +use std::collections::HashSet; +use std::io::{self, Read}; +use std::ptr; + +/// By using our Sink type, the arena is filled with parsed HTML. +fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> { + let sink = Sink { + arena, + document: arena.alloc(Node::new(NodeData::Document)), + quirks_mode: Cell::new(QuirksMode::NoQuirks), + }; + + parse_document(sink, Default::default()) + .from_utf8() + .one(bytes) +} + +type Arena<'arena> = &'arena typed_arena::Arena>; +type Ref<'arena> = &'arena Node<'arena>; +type Link<'arena> = Cell>>; + +/// Sink struct is responsible for handling how the data that comes out of the HTML parsing +/// unit (TreeBuilder in our case) is handled. +struct Sink<'arena> { + arena: Arena<'arena>, + document: Ref<'arena>, + quirks_mode: Cell, +} + +/// DOM node which contains links to other nodes in the tree. 
+pub struct Node<'arena> { + parent: Link<'arena>, + next_sibling: Link<'arena>, + previous_sibling: Link<'arena>, + first_child: Link<'arena>, + last_child: Link<'arena>, + data: NodeData<'arena>, +} + +/// HTML node data which can be an element, a comment, a string, a DOCTYPE, etc... +pub enum NodeData<'arena> { + Document, + Doctype { + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + }, + Text { + contents: RefCell, + }, + Comment { + contents: StrTendril, + }, + Element { + name: QualName, + attrs: RefCell>, + template_contents: Option>, + mathml_annotation_xml_integration_point: bool, + }, + ProcessingInstruction { + target: StrTendril, + contents: StrTendril, + }, +} + +impl<'arena> Node<'arena> { + fn new(data: NodeData<'arena>) -> Self { + Node { + parent: Cell::new(None), + previous_sibling: Cell::new(None), + next_sibling: Cell::new(None), + first_child: Cell::new(None), + last_child: Cell::new(None), + data, + } + } + + fn detach(&self) { + let parent = self.parent.take(); + let previous_sibling = self.previous_sibling.take(); + let next_sibling = self.next_sibling.take(); + + if let Some(next_sibling) = next_sibling { + next_sibling.previous_sibling.set(previous_sibling); + } else if let Some(parent) = parent { + parent.last_child.set(previous_sibling); + } + + if let Some(previous_sibling) = previous_sibling { + previous_sibling.next_sibling.set(next_sibling); + } else if let Some(parent) = parent { + parent.first_child.set(next_sibling); + } + } + + fn append(&'arena self, new_child: &'arena Self) { + new_child.detach(); + new_child.parent.set(Some(self)); + if let Some(last_child) = self.last_child.take() { + new_child.previous_sibling.set(Some(last_child)); + debug_assert!(last_child.next_sibling.get().is_none()); + last_child.next_sibling.set(Some(new_child)); + } else { + debug_assert!(self.first_child.get().is_none()); + self.first_child.set(Some(new_child)); + } + self.last_child.set(Some(new_child)); + } + + fn 
insert_before(&'arena self, new_sibling: &'arena Self) { + new_sibling.detach(); + new_sibling.parent.set(self.parent.get()); + new_sibling.next_sibling.set(Some(self)); + if let Some(previous_sibling) = self.previous_sibling.take() { + new_sibling.previous_sibling.set(Some(previous_sibling)); + debug_assert!(ptr::eq::( + previous_sibling.next_sibling.get().unwrap(), + self + )); + previous_sibling.next_sibling.set(Some(new_sibling)); + } else if let Some(parent) = self.parent.get() { + debug_assert!(ptr::eq::(parent.first_child.get().unwrap(), self)); + parent.first_child.set(Some(new_sibling)); + } + self.previous_sibling.set(Some(new_sibling)); + } +} + +impl<'arena> Sink<'arena> { + fn new_node(&self, data: NodeData<'arena>) -> Ref<'arena> { + self.arena.alloc(Node::new(data)) + } + + fn append_common(&self, child: NodeOrText>, previous: P, append: A) + where + P: FnOnce() -> Option>, + A: FnOnce(Ref<'arena>), + { + let new_node = match child { + NodeOrText::AppendText(text) => { + // Append to an existing Text node if we have one. + if let Some(&Node { + data: NodeData::Text { ref contents }, + .. + }) = previous() + { + contents.borrow_mut().push_tendril(&text); + return; + } + self.new_node(NodeData::Text { + contents: RefCell::new(text), + }) + }, + NodeOrText::AppendNode(node) => node, + }; + + append(new_node) + } +} + +/// By implementing the TreeSink trait we determine how the data from the tree building step +/// is processed. In our case, our data is allocated in the arena and added to the Node data +/// structure. +/// +/// For deeper understanding of each function go to the TreeSink declaration.
+impl<'arena> TreeSink for Sink<'arena> { + type Handle = Ref<'arena>; + type Output = Ref<'arena>; + type ElemName<'a> + = &'a QualName + where + Self: 'a; + + fn finish(self) -> Ref<'arena> { + self.document + } + + fn parse_error(&self, _: Cow<'static, str>) {} + + fn get_document(&self) -> Ref<'arena> { + self.document + } + + fn set_quirks_mode(&self, mode: QuirksMode) { + self.quirks_mode.set(mode); + } + + fn same_node(&self, x: &Ref<'arena>, y: &Ref<'arena>) -> bool { + ptr::eq::(*x, *y) + } + + fn elem_name(&self, target: &Ref<'arena>) -> Self::ElemName<'_> { + match target.data { + NodeData::Element { ref name, .. } => name, + _ => panic!("not an element!"), + } + } + + fn get_template_contents(&self, target: &Ref<'arena>) -> Ref<'arena> { + if let NodeData::Element { + template_contents: Some(contents), + .. + } = target.data + { + contents + } else { + panic!("not a template element!") + } + } + + fn is_mathml_annotation_xml_integration_point(&self, target: &Ref<'arena>) -> bool { + if let NodeData::Element { + mathml_annotation_xml_integration_point, + .. 
+ } = target.data + { + mathml_annotation_xml_integration_point + } else { + panic!("not an element!") + } + } + + fn create_element( + &self, + name: QualName, + attrs: Vec, + flags: ElementFlags, + ) -> Ref<'arena> { + self.new_node(NodeData::Element { + name, + attrs: RefCell::new(attrs), + template_contents: if flags.template { + Some(self.new_node(NodeData::Document)) + } else { + None + }, + mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point, + }) + } + + fn create_comment(&self, text: StrTendril) -> Ref<'arena> { + self.new_node(NodeData::Comment { contents: text }) + } + + fn create_pi(&self, target: StrTendril, data: StrTendril) -> Ref<'arena> { + self.new_node(NodeData::ProcessingInstruction { + target, + contents: data, + }) + } + + fn append(&self, parent: &Ref<'arena>, child: NodeOrText>) { + self.append_common( + child, + || parent.last_child.get(), + |new_node| parent.append(new_node), + ) + } + + fn append_before_sibling(&self, sibling: &Ref<'arena>, child: NodeOrText>) { + self.append_common( + child, + || sibling.previous_sibling.get(), + |new_node| sibling.insert_before(new_node), + ) + } + + fn append_based_on_parent_node( + &self, + element: &Ref<'arena>, + prev_element: &Ref<'arena>, + child: NodeOrText>, + ) { + if element.parent.get().is_some() { + self.append_before_sibling(element, child) + } else { + self.append(prev_element, child) + } + } + + fn append_doctype_to_document( + &self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ) { + self.document.append(self.new_node(NodeData::Doctype { + name, + public_id, + system_id, + })) + } + + fn add_attrs_if_missing(&self, target: &Ref<'arena>, attrs: Vec) { + let mut existing = if let NodeData::Element { ref attrs, .. 
} = target.data { + attrs.borrow_mut() + } else { + panic!("not an element") + }; + + let existing_names = existing + .iter() + .map(|e| e.name.clone()) + .collect::>(); + existing.extend( + attrs + .into_iter() + .filter(|attr| !existing_names.contains(&attr.name)), + ); + } + + fn remove_from_parent(&self, target: &Ref<'arena>) { + target.detach() + } + + fn reparent_children(&self, node: &Ref<'arena>, new_parent: &Ref<'arena>) { + let mut next_child = node.first_child.get(); + while let Some(child) = next_child { + debug_assert!(ptr::eq::(child.parent.get().unwrap(), *node)); + next_child = child.next_sibling.get(); + new_parent.append(child) + } + } +} + +/// In this example an "arena" is created and filled with the DOM nodes. +/// "Arena" is a type of allocation in which a block of memory is allocated +/// and later filled with data, DOM nodes in this case. When the arena is deallocated +/// it is destroyed with all of its items. +/// +/// Further info about arena: https://docs.rs/typed-arena/latest/typed_arena/ +fn main() { + // Read HTML from the standard input + let mut bytes = Vec::new(); + io::stdin().read_to_end(&mut bytes).unwrap(); + + let arena = typed_arena::Arena::new(); + html5ever_parse_slice_into_arena(&bytes, &arena); +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/examples/capi/tokenize.c b/collector/compile-benchmarks/html5ever-0.31.0/examples/capi/tokenize.c new file mode 100644 index 000000000..8c8cdd464 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/examples/capi/tokenize.c @@ -0,0 +1,74 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +#include + +#include "html5ever.h" + +void put_str(const char *x) { + fputs(x, stdout); +} + +void put_buf(struct h5e_buf text) { + fwrite(text.data, text.len, 1, stdout); +} + +void do_chars(void *user, struct h5e_buf text) { + put_str("CHARS : "); + put_buf(text); + put_str("\n"); +} + +void do_start_tag(void *user, struct h5e_buf name, int self_closing, size_t num_attrs) { + put_str("TAG : <"); + put_buf(name); + if (self_closing) { + putchar('/'); + } + put_str(">\n"); +} + +void do_tag_attr(void *user, struct h5e_buf name, struct h5e_buf value) { + put_str(" ATTR: "); + put_buf(name); + put_str("=\""); + put_buf(value); + put_str("\"\n"); +} + +void do_end_tag(void *user, struct h5e_buf name) { + put_str("TAG : \n"); +} + +struct h5e_token_ops ops = { + .do_chars = do_chars, + .do_start_tag = do_start_tag, + .do_tag_attr = do_tag_attr, + .do_end_tag = do_end_tag, +}; + +struct h5e_token_sink sink = { + .ops = &ops, + .user = NULL, +}; + +int main(int argc, char *argv[]) { + if (argc < 2) { + printf("Usage: %s 'HTML fragment'\n", argv[0]); + return 1; + } + + struct h5e_tokenizer *tok = h5e_tokenizer_new(&sink); + h5e_tokenizer_feed(tok, h5e_buf_from_cstr(argv[1])); + h5e_tokenizer_end(tok); + h5e_tokenizer_free(tok); + return 0; +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/examples/noop-tokenize.rs b/collector/compile-benchmarks/html5ever-0.31.0/examples/noop-tokenize.rs new file mode 100644 index 000000000..a95404df7 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/examples/noop-tokenize.rs @@ -0,0 +1,47 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// Run a single benchmark once. For use with profiling tools. 
+ +extern crate html5ever; + +use std::cell::RefCell; +use std::io; + +use html5ever::tendril::*; +use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer}; + +/// In our case, our sink only contains a tokens vector +struct Sink(RefCell>); + +impl TokenSink for Sink { + type Handle = (); + + /// Each processed token will be handled by this method + fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<()> { + self.0.borrow_mut().push(token); + TokenSinkResult::Continue + } +} + +/// In this example we implement the TokenSink trait which lets us implement how each +/// parsed token is treated. In our example we take each token and insert it into a vector. +fn main() { + // Read HTML from standard input + let mut chunk = ByteTendril::new(); + io::stdin().read_to_tendril(&mut chunk).unwrap(); + + let input = BufferQueue::default(); + input.push_back(chunk.try_reinterpret().unwrap()); + + let tok = Tokenizer::new(Sink(RefCell::new(Vec::new())), Default::default()); + let _ = tok.feed(&input); + assert!(input.is_empty()); + tok.end(); +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/examples/noop-tree-builder.rs b/collector/compile-benchmarks/html5ever-0.31.0/examples/noop-tree-builder.rs new file mode 100644 index 000000000..ee09155a3 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/examples/noop-tree-builder.rs @@ -0,0 +1,132 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +#[macro_use] +extern crate html5ever; + +use std::borrow::Cow; +use std::cell::{Cell, RefCell}; +use std::collections::HashMap; +use std::io; + +use html5ever::parse_document; +use html5ever::tendril::*; +use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; +use html5ever::{Attribute, ExpandedName, QualName}; + +struct Sink { + next_id: Cell, + names: RefCell>, +} + +impl Sink { + fn get_id(&self) -> usize { + let id = self.next_id.get(); + self.next_id.set(id + 2); + id + } +} + +/// By implementing the TreeSink trait we determine how the data from the tree building step +/// is processed. In this case the DOM elements are written into the "names" hashmap. +/// +/// For deeper understanding of each function go to the TreeSink declaration. +impl TreeSink for Sink { + type Handle = usize; + type Output = Self; + type ElemName<'a> = ExpandedName<'a>; + fn finish(self) -> Self { + self + } + + fn get_document(&self) -> usize { + 0 + } + + fn get_template_contents(&self, target: &usize) -> usize { + if let Some(expanded_name!(html "template")) = + self.names.borrow().get(target).map(|n| n.expanded()) + { + target + 1 + } else { + panic!("not a template element") + } + } + + fn same_node(&self, x: &usize, y: &usize) -> bool { + x == y + } + + fn elem_name(&self, target: &usize) -> ExpandedName { + self.names + .borrow() + .get(target) + .expect("not an element") + .expanded() + } + + fn create_element(&self, name: QualName, _: Vec, _: ElementFlags) -> usize { + let id = self.get_id(); + // N.B. We intentionally leak memory here to minimize the implementation complexity + // of this example code. A real implementation would either want to use a + // real DOM tree implementation, or else use an arena as the backing store for + // memory used by the parser.
+ self.names + .borrow_mut() + .insert(id, Box::leak(Box::new(name))); + id + } + + fn create_comment(&self, _text: StrTendril) -> usize { + self.get_id() + } + + #[allow(unused_variables)] + fn create_pi(&self, target: StrTendril, value: StrTendril) -> usize { + unimplemented!() + } + + fn append_before_sibling(&self, _sibling: &usize, _new_node: NodeOrText) {} + + fn append_based_on_parent_node( + &self, + _element: &usize, + _prev_element: &usize, + _new_node: NodeOrText, + ) { + } + + fn parse_error(&self, _msg: Cow<'static, str>) {} + fn set_quirks_mode(&self, _mode: QuirksMode) {} + fn append(&self, _parent: &usize, _child: NodeOrText) {} + + fn append_doctype_to_document(&self, _: StrTendril, _: StrTendril, _: StrTendril) {} + fn add_attrs_if_missing(&self, target: &usize, _attrs: Vec) { + assert!(self.names.borrow().contains_key(target), "not an element"); + } + fn remove_from_parent(&self, _target: &usize) {} + fn reparent_children(&self, _node: &usize, _new_parent: &usize) {} + fn mark_script_already_started(&self, _node: &usize) {} +} + +/// In this example we implement the TreeSink trait which takes each parsed elements and insert +/// it to a hashmap, while each element is given a numeric id. +fn main() { + let sink = Sink { + next_id: Cell::new(1), + names: RefCell::new(HashMap::new()), + }; + + // Read HTML from the standard input and parse it + let stdin = io::stdin(); + parse_document(sink, Default::default()) + .from_utf8() + .read_from(&mut stdin.lock()) + .unwrap(); +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/examples/print-tree-actions.rs b/collector/compile-benchmarks/html5ever-0.31.0/examples/print-tree-actions.rs new file mode 100644 index 000000000..dfa0aedd0 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/examples/print-tree-actions.rs @@ -0,0 +1,185 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#[macro_use] +extern crate html5ever; + +use std::borrow::Cow; +use std::cell::{Cell, Ref, RefCell}; +use std::collections::HashMap; +use std::io; + +use html5ever::parse_document; +use html5ever::tendril::*; +use html5ever::tree_builder::{ + AppendNode, AppendText, ElementFlags, NodeOrText, QuirksMode, TreeSink, +}; +use html5ever::{Attribute, QualName}; + +struct Sink { + next_id: Cell, + names: RefCell>, +} + +impl Sink { + fn get_id(&self) -> usize { + let id = self.next_id.get(); + self.next_id.set(id + 2); + id + } +} + +impl TreeSink for Sink { + type Handle = usize; + type Output = Self; + type ElemName<'a> = Ref<'a, QualName>; + fn finish(self) -> Self { + self + } + + fn parse_error(&self, msg: Cow<'static, str>) { + println!("Parse error: {msg}"); + } + + fn get_document(&self) -> usize { + 0 + } + + fn get_template_contents(&self, target: &usize) -> usize { + if let Some(expanded_name!(html "template")) = + self.names.borrow().get(target).map(|n| n.expanded()) + { + target + 1 + } else { + panic!("not a template element") + } + } + + fn set_quirks_mode(&self, mode: QuirksMode) { + println!("Set quirks mode to {mode:?}"); + } + + fn same_node(&self, x: &usize, y: &usize) -> bool { + x == y + } + + fn elem_name(&self, target: &usize) -> Self::ElemName<'_> { + Ref::map(self.names.borrow(), |map| { + *map.get(target).expect("not an element") + }) + } + + fn create_element(&self, name: QualName, _: Vec, _: ElementFlags) -> usize { + let id = self.get_id(); + println!("Created {name:?} as {id}"); + // N.B. We intentionally leak memory here to minimize the implementation complexity + // of this example code. A real implementation would either want to use a + // real DOM tree implementation, or else use an arena as the backing store for + // memory used by the parser.
+ self.names + .borrow_mut() + .insert(id, Box::leak(Box::new(name))); + id + } + + fn create_comment(&self, text: StrTendril) -> usize { + let id = self.get_id(); + println!("Created comment \"{}\" as {}", text.escape_default(), id); + id + } + + #[allow(unused_variables)] + fn create_pi(&self, target: StrTendril, value: StrTendril) -> usize { + unimplemented!() + } + + fn append(&self, parent: &usize, child: NodeOrText) { + match child { + AppendNode(n) => println!("Append node {n} to {parent}"), + AppendText(t) => println!("Append text to {}: \"{}\"", parent, t.escape_default()), + } + } + + fn append_before_sibling(&self, sibling: &usize, new_node: NodeOrText) { + match new_node { + AppendNode(n) => println!("Append node {n} before {sibling}"), + AppendText(t) => println!("Append text before {}: \"{}\"", sibling, t.escape_default()), + } + } + + fn append_based_on_parent_node( + &self, + element: &Self::Handle, + _prev_element: &Self::Handle, + child: NodeOrText, + ) { + self.append_before_sibling(element, child); + } + + fn append_doctype_to_document( + &self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ) { + println!("Append doctype: {name} {public_id} {system_id}"); + } + + fn add_attrs_if_missing(&self, target: &usize, attrs: Vec) { + assert!(self.names.borrow().contains_key(target), "not an element"); + println!("Add missing attributes to {target}:"); + for attr in attrs.into_iter() { + println!(" {:?} = {}", attr.name, attr.value); + } + } + + fn associate_with_form( + &self, + _target: &usize, + _form: &usize, + _nodes: (&usize, Option<&usize>), + ) { + // No form owner support. 
+ } + + fn remove_from_parent(&self, target: &usize) { + println!("Remove {target} from parent"); + } + + fn reparent_children(&self, node: &usize, new_parent: &usize) { + println!("Move children from {node} to {new_parent}"); + } + + fn mark_script_already_started(&self, node: &usize) { + println!("Mark script {node} as already started"); + } + + fn set_current_line(&self, line_number: u64) { + println!("Set current line to {line_number}"); + } + + fn pop(&self, elem: &usize) { + println!("Popped element {elem}"); + } +} + +/// Same example as the "noop-tree-builder", but this time every function implemented in our +/// Sink object prints a log, so it's easier to get an understanding of when each function is +/// called. +fn main() { + let sink = Sink { + next_id: Cell::new(1), + names: RefCell::new(HashMap::new()), + }; + let stdin = io::stdin(); + parse_document(sink, Default::default()) + .from_utf8() + .read_from(&mut stdin.lock()) + .unwrap(); +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/examples/tokenize.rs b/collector/compile-benchmarks/html5ever-0.31.0/examples/tokenize.rs new file mode 100644 index 000000000..ba984d8fd --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/examples/tokenize.rs @@ -0,0 +1,111 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms.
+ +extern crate html5ever; + +use std::cell::Cell; +use std::io; + +use html5ever::tendril::*; +use html5ever::tokenizer::BufferQueue; +use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken}; +use html5ever::tokenizer::{ + ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, +}; + +#[derive(Clone)] +struct TokenPrinter { + in_char_run: Cell, +} + +impl TokenPrinter { + fn is_char(&self, is_char: bool) { + match (self.in_char_run.get(), is_char) { + (false, true) => print!("CHAR : \""), + (true, false) => println!("\""), + _ => (), + } + self.in_char_run.set(is_char); + } + + fn do_char(&self, c: char) { + self.is_char(true); + print!("{}", c.escape_default().collect::()); + } +} + +impl TokenSink for TokenPrinter { + type Handle = (); + + fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<()> { + match token { + CharacterTokens(b) => { + for c in b.chars() { + self.do_char(c); + } + }, + NullCharacterToken => self.do_char('\0'), + TagToken(tag) => { + self.is_char(false); + // This is not proper HTML serialization, of course. + match tag.kind { + StartTag => print!("TAG : <\x1b[32m{}\x1b[0m", tag.name), + EndTag => print!("TAG : <\x1b[31m/{}\x1b[0m", tag.name), + } + for attr in tag.attrs.iter() { + print!( + " \x1b[36m{}\x1b[0m='\x1b[34m{}\x1b[0m'", + attr.name.local, attr.value + ); + } + if tag.self_closing { + print!(" \x1b[31m/\x1b[0m"); + } + println!(">"); + }, + ParseError(err) => { + self.is_char(false); + println!("ERROR: {err}"); + }, + _ => { + self.is_char(false); + println!("OTHER: {token:?}"); + }, + } + TokenSinkResult::Continue + } +} + +/// In this example we implement the TokenSink trait in such a way that each token is printed. +/// If there's an error while processing a token it is printed as well.
+fn main() { + let sink = TokenPrinter { + in_char_run: Cell::new(false), + }; + + // Read HTML from standard input + let mut chunk = ByteTendril::new(); + io::stdin().read_to_tendril(&mut chunk).unwrap(); + + let input = BufferQueue::default(); + input.push_back(chunk.try_reinterpret().unwrap()); + + let tok = Tokenizer::new( + sink, + TokenizerOpts { + profile: true, + ..Default::default() + }, + ); + let _ = tok.feed(&input); + + assert!(input.is_empty()); + tok.end(); + tok.sink.is_char(false); +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/perf-config.json b/collector/compile-benchmarks/html5ever-0.31.0/perf-config.json new file mode 100644 index 000000000..710581fa8 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/perf-config.json @@ -0,0 +1,4 @@ +{ + "artifact": "library", + "category": "primary" +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/driver.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/driver.rs new file mode 100644 index 000000000..1f66ebcaa --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/driver.rs @@ -0,0 +1,137 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! High-level interface to the parser. + +use crate::buffer_queue::BufferQueue; +use crate::tokenizer::{Tokenizer, TokenizerOpts}; +use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink}; +use crate::{Attribute, QualName}; +use markup5ever::TokenizerResult; +use std::borrow::Cow; + +use crate::tendril; +use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder}; +use crate::tendril::StrTendril; + +/// All-encompassing options struct for the parser. 
+#[derive(Clone, Default)] +pub struct ParseOpts { + /// Tokenizer options. + pub tokenizer: TokenizerOpts, + + /// Tree builder options. + pub tree_builder: TreeBuilderOpts, +} + +/// Parse an HTML document +/// +/// The returned value implements `tendril::TendrilSink` +/// so that Unicode input may be provided incrementally, +/// or all at once with the `one` method. +/// +/// If your input is bytes, use `Parser::from_utf8`. +pub fn parse_document(sink: Sink, opts: ParseOpts) -> Parser +where + Sink: TreeSink, +{ + let tb = TreeBuilder::new(sink, opts.tree_builder); + let tok = Tokenizer::new(tb, opts.tokenizer); + Parser { + tokenizer: tok, + input_buffer: BufferQueue::default(), + } +} + +/// Parse an HTML fragment +/// +/// The returned value implements `tendril::TendrilSink` +/// so that Unicode input may be provided incrementally, +/// or all at once with the `one` method. +/// +/// If your input is bytes, use `Parser::from_utf8`. +pub fn parse_fragment( + sink: Sink, + opts: ParseOpts, + context_name: QualName, + context_attrs: Vec, +) -> Parser +where + Sink: TreeSink, +{ + let context_elem = create_element(&sink, context_name, context_attrs); + parse_fragment_for_element(sink, opts, context_elem, None) +} + +/// Like `parse_fragment`, but with an existing context element +/// and optionally a form element. +pub fn parse_fragment_for_element( + sink: Sink, + opts: ParseOpts, + context_element: Sink::Handle, + form_element: Option, +) -> Parser +where + Sink: TreeSink, +{ + let tb = TreeBuilder::new_for_fragment(sink, context_element, form_element, opts.tree_builder); + let tok_opts = TokenizerOpts { + initial_state: Some(tb.tokenizer_state_for_context_elem()), + ..opts.tokenizer + }; + let tok = Tokenizer::new(tb, tok_opts); + Parser { + tokenizer: tok, + input_buffer: BufferQueue::default(), + } +} + +/// An HTML parser, +/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods. 
+pub struct Parser +where + Sink: TreeSink, +{ + pub tokenizer: Tokenizer>, + pub input_buffer: BufferQueue, +} + +impl TendrilSink for Parser { + fn process(&mut self, t: StrTendril) { + self.input_buffer.push_back(t); + // FIXME: Properly support somehow. + while let TokenizerResult::Script(_) = self.tokenizer.feed(&self.input_buffer) {} + } + + // FIXME: Is it too noisy to report every character decoding error? + fn error(&mut self, desc: Cow<'static, str>) { + self.tokenizer.sink.sink.parse_error(desc) + } + + type Output = Sink::Output; + + fn finish(self) -> Self::Output { + // FIXME: Properly support somehow. + while let TokenizerResult::Script(_) = self.tokenizer.feed(&self.input_buffer) {} + assert!(self.input_buffer.is_empty()); + self.tokenizer.end(); + self.tokenizer.sink.sink.finish() + } +} + +impl Parser { + /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes. + /// + /// Use this when your input is bytes that are known to be in the UTF-8 encoding. + /// Decoding is lossy, like `String::from_utf8_lossy`. + #[allow(clippy::wrong_self_convention)] + pub fn from_utf8(self) -> Utf8LossyDecoder { + Utf8LossyDecoder::new(self) + } +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/lib.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/lib.rs new file mode 100644 index 000000000..e1415f602 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/lib.rs @@ -0,0 +1,31 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +#![crate_name = "html5ever"] +#![crate_type = "dylib"] +#![cfg_attr(test, deny(warnings))] +#![allow(unused_parens)] +#![warn(unreachable_pub)] + +pub use driver::{parse_document, parse_fragment, ParseOpts, Parser}; +pub use markup5ever::*; + +pub use serialize::serialize; + +#[macro_use] +mod macros; + +mod util { + pub(crate) mod str; +} + +pub mod driver; +pub mod serialize; +pub mod tokenizer; +pub mod tree_builder; diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/macros.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/macros.rs new file mode 100644 index 000000000..afc649a3e --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/macros.rs @@ -0,0 +1,36 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +macro_rules! unwrap_or_else { + ($opt:expr, $else_block:block) => {{ + let Some(x) = $opt else { $else_block }; + x + }}; +} + +macro_rules! unwrap_or_return { + ($opt:expr) => { + unwrap_or_else!($opt, { + return; + }) + }; + ($opt:expr, $retval:expr) => { + unwrap_or_else!($opt, { return $retval }) + }; +} + +macro_rules! time { + ($e:expr) => {{ + let now = ::std::time::Instant::now(); + let result = $e; + let d = now.elapsed(); + let dt = d.as_secs() * 1_000_000_000 + u64::from(d.subsec_nanos()); + (result, dt) + }}; +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/serialize/mod.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/serialize/mod.rs new file mode 100644 index 000000000..710066a2b --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/serialize/mod.rs @@ -0,0 +1,255 @@ +// Copyright 2014-2017 The html5ever Project Developers. 
See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use log::warn; +pub use markup5ever::serialize::{AttrRef, Serialize, Serializer, TraversalScope}; +use markup5ever::{local_name, namespace_url, ns}; +use std::io::{self, Write}; + +use crate::{LocalName, QualName}; + +pub fn serialize(writer: Wr, node: &T, opts: SerializeOpts) -> io::Result<()> +where + Wr: Write, + T: Serialize, +{ + let mut ser = HtmlSerializer::new(writer, opts.clone()); + node.serialize(&mut ser, opts.traversal_scope) +} + +#[derive(Clone)] +pub struct SerializeOpts { + /// Is scripting enabled? Default: true + pub scripting_enabled: bool, + + /// Serialize the root node? Default: ChildrenOnly + pub traversal_scope: TraversalScope, + + /// If the serializer is asked to serialize an invalid tree, the default + /// behavior is to panic in the event that an `end_elem` is created without a + /// matching `start_elem`. Setting this to true will prevent those panics by + /// creating a default parent on the element stack. No extra start elem will + /// actually be written. 
Default: false + pub create_missing_parent: bool, +} + +impl Default for SerializeOpts { + fn default() -> SerializeOpts { + SerializeOpts { + scripting_enabled: true, + traversal_scope: TraversalScope::ChildrenOnly(None), + create_missing_parent: false, + } + } +} + +#[derive(Default)] +struct ElemInfo { + html_name: Option, + ignore_children: bool, +} + +pub struct HtmlSerializer { + pub writer: Wr, + opts: SerializeOpts, + stack: Vec, +} + +fn tagname(name: &QualName) -> LocalName { + match name.ns { + ns!(html) | ns!(mathml) | ns!(svg) => (), + ref ns => { + // FIXME(#122) + warn!("node with weird namespace {:?}", ns); + }, + } + + name.local.clone() +} + +impl HtmlSerializer { + pub fn new(writer: Wr, opts: SerializeOpts) -> Self { + let html_name = match opts.traversal_scope { + TraversalScope::IncludeNode | TraversalScope::ChildrenOnly(None) => None, + TraversalScope::ChildrenOnly(Some(ref n)) => Some(tagname(n)), + }; + HtmlSerializer { + writer, + opts, + stack: vec![ElemInfo { + html_name, + ignore_children: false, + }], + } + } + + fn parent(&mut self) -> &mut ElemInfo { + if self.stack.is_empty() { + if self.opts.create_missing_parent { + warn!("ElemInfo stack empty, creating new parent"); + self.stack.push(Default::default()); + } else { + panic!("no parent ElemInfo") + } + } + self.stack.last_mut().unwrap() + } + + fn write_escaped(&mut self, text: &str, attr_mode: bool) -> io::Result<()> { + for c in text.chars() { + match c { + '&' => self.writer.write_all(b"&"), + '\u{00A0}' => self.writer.write_all(b" "), + '"' if attr_mode => self.writer.write_all(b"""), + '<' if !attr_mode => self.writer.write_all(b"<"), + '>' if !attr_mode => self.writer.write_all(b">"), + c => self.writer.write_fmt(format_args!("{c}")), + }?; + } + Ok(()) + } +} + +impl Serializer for HtmlSerializer { + fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()> + where + AttrIter: Iterator>, + { + let html_name = match name.ns { + ns!(html) => 
Some(name.local.clone()), + _ => None, + }; + + if self.parent().ignore_children { + self.stack.push(ElemInfo { + html_name, + ignore_children: true, + }); + return Ok(()); + } + + self.writer.write_all(b"<")?; + self.writer.write_all(tagname(&name).as_bytes())?; + for (name, value) in attrs { + self.writer.write_all(b" ")?; + + match name.ns { + ns!() => (), + ns!(xml) => self.writer.write_all(b"xml:")?, + ns!(xmlns) => { + if name.local != local_name!("xmlns") { + self.writer.write_all(b"xmlns:")?; + } + }, + ns!(xlink) => self.writer.write_all(b"xlink:")?, + ref ns => { + // FIXME(#122) + warn!("attr with weird namespace {:?}", ns); + self.writer.write_all(b"unknown_namespace:")?; + }, + } + + self.writer.write_all(name.local.as_bytes())?; + self.writer.write_all(b"=\"")?; + self.write_escaped(value, true)?; + self.writer.write_all(b"\"")?; + } + self.writer.write_all(b">")?; + + let ignore_children = name.ns == ns!(html) + && matches!( + name.local, + local_name!("area") + | local_name!("base") + | local_name!("basefont") + | local_name!("bgsound") + | local_name!("br") + | local_name!("col") + | local_name!("embed") + | local_name!("frame") + | local_name!("hr") + | local_name!("img") + | local_name!("input") + | local_name!("keygen") + | local_name!("link") + | local_name!("meta") + | local_name!("param") + | local_name!("source") + | local_name!("track") + | local_name!("wbr") + ); + + self.stack.push(ElemInfo { + html_name, + ignore_children, + }); + + Ok(()) + } + + fn end_elem(&mut self, name: QualName) -> io::Result<()> { + let info = match self.stack.pop() { + Some(info) => info, + None if self.opts.create_missing_parent => { + warn!("missing ElemInfo, creating default."); + Default::default() + }, + _ => panic!("no ElemInfo"), + }; + if info.ignore_children { + return Ok(()); + } + + self.writer.write_all(b"") + } + + fn write_text(&mut self, text: &str) -> io::Result<()> { + let escape = match self.parent().html_name { + Some(local_name!("style")) + | 
Some(local_name!("script")) + | Some(local_name!("xmp")) + | Some(local_name!("iframe")) + | Some(local_name!("noembed")) + | Some(local_name!("noframes")) + | Some(local_name!("plaintext")) => false, + + Some(local_name!("noscript")) => !self.opts.scripting_enabled, + + _ => true, + }; + + if escape { + self.write_escaped(text, false) + } else { + self.writer.write_all(text.as_bytes()) + } + } + + fn write_comment(&mut self, text: &str) -> io::Result<()> { + self.writer.write_all(b"") + } + + fn write_doctype(&mut self, name: &str) -> io::Result<()> { + self.writer.write_all(b"") + } + + fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> { + self.writer.write_all(b"") + } +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/char_ref/mod.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/char_ref/mod.rs new file mode 100644 index 000000000..2f3c6c663 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/char_ref/mod.rs @@ -0,0 +1,445 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{TokenSink, Tokenizer}; +use crate::buffer_queue::BufferQueue; +use crate::data; +use crate::tendril::StrTendril; + +use log::debug; +use mac::format_if; +use std::borrow::Cow::Borrowed; +use std::char::from_u32; + +use self::State::*; +pub(super) use self::Status::*; + +//§ tokenizing-character-references +pub(super) struct CharRef { + /// The resulting character(s) + pub(super) chars: [char; 2], + + /// How many slots in `chars` are valid? 
+ pub(super) num_chars: u8, +} + +pub(super) enum Status { + Stuck, + Progress, + Done, +} + +#[derive(Debug)] +enum State { + Begin, + Octothorpe, + Numeric(u32), // base + NumericSemicolon, + Named, + BogusName, +} + +pub(super) struct CharRefTokenizer { + state: State, + result: Option, + is_consumed_in_attribute: bool, + + num: u32, + num_too_big: bool, + seen_digit: bool, + hex_marker: Option, + + name_buf_opt: Option, + name_match: Option<(u32, u32)>, + name_len: usize, +} + +impl CharRefTokenizer { + pub(super) fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer { + CharRefTokenizer { + is_consumed_in_attribute, + state: Begin, + result: None, + num: 0, + num_too_big: false, + seen_digit: false, + hex_marker: None, + name_buf_opt: None, + name_match: None, + name_len: 0, + } + } + + // A CharRefTokenizer can only tokenize one character reference, + // so this method consumes the tokenizer. + pub(super) fn get_result(self) -> CharRef { + self.result.expect("get_result called before done") + } + + fn name_buf(&self) -> &StrTendril { + self.name_buf_opt + .as_ref() + .expect("name_buf missing in named character reference") + } + + fn name_buf_mut(&mut self) -> &mut StrTendril { + self.name_buf_opt + .as_mut() + .expect("name_buf missing in named character reference") + } + + fn finish_none(&mut self) -> Status { + self.result = Some(CharRef { + chars: ['\0', '\0'], + num_chars: 0, + }); + Done + } + + fn finish_one(&mut self, c: char) -> Status { + self.result = Some(CharRef { + chars: [c, '\0'], + num_chars: 1, + }); + Done + } +} + +impl CharRefTokenizer { + pub(super) fn step( + &mut self, + tokenizer: &Tokenizer, + input: &BufferQueue, + ) -> Status { + if self.result.is_some() { + return Done; + } + + debug!("char ref tokenizer stepping in state {:?}", self.state); + match self.state { + Begin => self.do_begin(tokenizer, input), + Octothorpe => self.do_octothorpe(tokenizer, input), + Numeric(base) => self.do_numeric(tokenizer, input, base), + 
NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), + Named => self.do_named(tokenizer, input), + BogusName => self.do_bogus_name(tokenizer, input), + } + } + + fn do_begin( + &mut self, + tokenizer: &Tokenizer, + input: &BufferQueue, + ) -> Status { + match unwrap_or_return!(tokenizer.peek(input), Stuck) { + 'a'..='z' | 'A'..='Z' | '0'..='9' => { + self.state = Named; + self.name_buf_opt = Some(StrTendril::new()); + Progress + }, + + '#' => { + tokenizer.discard_char(input); + self.state = Octothorpe; + Progress + }, + _ => self.finish_none(), + } + } + + fn do_octothorpe( + &mut self, + tokenizer: &Tokenizer, + input: &BufferQueue, + ) -> Status { + let c = unwrap_or_return!(tokenizer.peek(input), Stuck); + match c { + 'x' | 'X' => { + tokenizer.discard_char(input); + self.hex_marker = Some(c); + self.state = Numeric(16); + }, + + _ => { + self.hex_marker = None; + self.state = Numeric(10); + }, + } + Progress + } + + fn do_numeric( + &mut self, + tokenizer: &Tokenizer, + input: &BufferQueue, + base: u32, + ) -> Status { + let c = unwrap_or_return!(tokenizer.peek(input), Stuck); + match c.to_digit(base) { + Some(n) => { + tokenizer.discard_char(input); + self.num = self.num.wrapping_mul(base); + if self.num > 0x10FFFF { + // We might overflow, and the character is definitely invalid. + // We still parse digits and semicolon, but don't use the result. 
+ self.num_too_big = true; + } + self.num = self.num.wrapping_add(n); + self.seen_digit = true; + Progress + }, + + None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), + + None => { + self.state = NumericSemicolon; + Progress + }, + } + } + + fn do_numeric_semicolon( + &mut self, + tokenizer: &Tokenizer, + input: &BufferQueue, + ) -> Status { + match unwrap_or_return!(tokenizer.peek(input), Stuck) { + ';' => tokenizer.discard_char(input), + _ => tokenizer.emit_error(Borrowed( + "Semicolon missing after numeric character reference", + )), + }; + self.finish_numeric(tokenizer) + } + + fn unconsume_numeric( + &mut self, + tokenizer: &Tokenizer, + input: &BufferQueue, + ) -> Status { + let mut unconsume = StrTendril::from_char('#'); + if let Some(c) = self.hex_marker { + unconsume.push_char(c) + } + + input.push_front(unconsume); + tokenizer.emit_error(Borrowed("Numeric character reference without digits")); + self.finish_none() + } + + fn finish_numeric(&mut self, tokenizer: &Tokenizer) -> Status { + fn conv(n: u32) -> char { + from_u32(n).expect("invalid char missed by error handling cases") + } + + let (c, error) = match self.num { + n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), + 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), + + 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { + Some(c) => (c, true), + None => (conv(self.num), true), + }, + + 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true), + + n if (n & 0xFFFE) == 0xFFFE => (conv(n), true), + + n => (conv(n), false), + }; + + if error { + let msg = format_if!( + tokenizer.opts.exact_errors, + "Invalid numeric character reference", + "Invalid numeric character reference value 0x{:06X}", + self.num + ); + tokenizer.emit_error(msg); + } + + self.finish_one(c) + } + + fn do_named( + &mut self, + tokenizer: &Tokenizer, + input: &BufferQueue, + ) -> Status { + // peek + discard skips over newline normalization, therefore 
making it easier to + // un-consume + let c = unwrap_or_return!(tokenizer.peek(input), Stuck); + tokenizer.discard_char(input); + self.name_buf_mut().push_char(c); + match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { + // We have either a full match or a prefix of one. + Some(&m) => { + if m.0 != 0 { + // We have a full match, but there might be a longer one to come. + self.name_match = Some(m); + self.name_len = self.name_buf().len(); + } + // Otherwise we just have a prefix match. + Progress + }, + + // Can't continue the match. + None => self.finish_named(tokenizer, input, Some(c)), + } + } + + fn emit_name_error(&mut self, tokenizer: &Tokenizer) { + let msg = format_if!( + tokenizer.opts.exact_errors, + "Invalid character reference", + "Invalid character reference &{}", + self.name_buf() + ); + tokenizer.emit_error(msg); + } + + fn unconsume_name(&mut self, input: &BufferQueue) { + input.push_front(self.name_buf_opt.take().unwrap()); + } + + fn finish_named( + &mut self, + tokenizer: &Tokenizer, + input: &BufferQueue, + end_char: Option, + ) -> Status { + match self.name_match { + None => { + match end_char { + Some(c) if c.is_ascii_alphanumeric() => { + // Keep looking for a semicolon, to determine whether + // we emit a parse error. + self.state = BogusName; + return Progress; + }, + + // Check length because &; is not a parse error. + Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer), + + _ => (), + } + self.unconsume_name(input); + self.finish_none() + }, + + Some((c1, c2)) => { + // We have a complete match, but we may have consumed + // additional characters into self.name_buf. 
Usually + // at least one, but several in cases like + // + // ¬ => match for U+00AC + // ¬i => valid prefix for ¬in + // ¬it => can't continue match + + let name_len = self.name_len; + assert!(name_len > 0); + let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap(); + + // There might not be a next character after the match, if + // we had a full match and then hit EOF. + let next_after = if name_len == self.name_buf().len() { + None + } else { + Some(self.name_buf()[name_len..].chars().next().unwrap()) + }; + + // If the character reference was consumed as part of an attribute, and the last + // character matched is not a U+003B SEMICOLON character (;), and the next input + // character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric, + // then, for historical reasons, flush code points consumed as a character + // reference and switch to the return state. + + let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after) + { + (_, ';', _) => false, + (true, _, Some('=')) => true, + (true, _, Some(c)) if c.is_ascii_alphanumeric() => true, + _ => { + // 1. If the last character matched is not a U+003B SEMICOLON character + // (;), then this is a missing-semicolon-after-character-reference parse + // error. 
+ tokenizer.emit_error(Borrowed( + "Character reference does not end with semicolon", + )); + false + }, + }; + + if unconsume_all { + self.unconsume_name(input); + self.finish_none() + } else { + input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..])); + tokenizer.ignore_lf.set(false); + self.result = Some(CharRef { + chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], + num_chars: if c2 == 0 { 1 } else { 2 }, + }); + Done + } + }, + } + } + + fn do_bogus_name( + &mut self, + tokenizer: &Tokenizer, + input: &BufferQueue, + ) -> Status { + // peek + discard skips over newline normalization, therefore making it easier to + // un-consume + let c = unwrap_or_return!(tokenizer.peek(input), Stuck); + tokenizer.discard_char(input); + self.name_buf_mut().push_char(c); + match c { + _ if c.is_ascii_alphanumeric() => return Progress, + ';' => self.emit_name_error(tokenizer), + _ => (), + } + self.unconsume_name(input); + self.finish_none() + } + + pub(super) fn end_of_file( + &mut self, + tokenizer: &Tokenizer, + input: &BufferQueue, + ) { + while self.result.is_none() { + match self.state { + Begin => drop(self.finish_none()), + + Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)), + + Numeric(_) | NumericSemicolon => { + tokenizer.emit_error(Borrowed("EOF in numeric character reference")); + self.finish_numeric(tokenizer); + }, + + Named => drop(self.finish_named(tokenizer, input, None)), + + BogusName => { + self.unconsume_name(input); + self.finish_none(); + }, + + Octothorpe => { + input.push_front(StrTendril::from_slice("#")); + tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); + self.finish_none(); + }, + } + } + } +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/interface.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/interface.rs new file mode 100644 index 000000000..edc6afb99 --- /dev/null +++ 
b/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/interface.rs @@ -0,0 +1,99 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use crate::interface::Attribute; +use crate::tendril::StrTendril; +use crate::tokenizer::states; +use crate::LocalName; +use std::borrow::Cow; + +pub use self::TagKind::{EndTag, StartTag}; +pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken}; +pub use self::Token::{EOFToken, NullCharacterToken, ParseError}; + +/// A `DOCTYPE` token. +// FIXME: already exists in Servo DOM +#[derive(PartialEq, Eq, Clone, Debug, Default)] +pub struct Doctype { + pub name: Option, + pub public_id: Option, + pub system_id: Option, + pub force_quirks: bool, +} + +#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] +pub enum TagKind { + StartTag, + EndTag, +} + +/// A tag token. +#[derive(PartialEq, Eq, Clone, Debug)] +pub struct Tag { + pub kind: TagKind, + pub name: LocalName, + pub self_closing: bool, + pub attrs: Vec, +} + +impl Tag { + /// Are the tags equivalent when we don't care about attribute order? + /// Also ignores the self-closing flag. 
+ pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool { + if (self.kind != other.kind) || (self.name != other.name) { + return false; + } + + let mut self_attrs = self.attrs.clone(); + let mut other_attrs = other.attrs.clone(); + self_attrs.sort(); + other_attrs.sort(); + + self_attrs == other_attrs + } +} + +#[derive(PartialEq, Eq, Debug)] +pub enum Token { + DoctypeToken(Doctype), + TagToken(Tag), + CommentToken(StrTendril), + CharacterTokens(StrTendril), + NullCharacterToken, + EOFToken, + ParseError(Cow<'static, str>), +} + +#[derive(Debug, PartialEq)] +#[must_use] +pub enum TokenSinkResult { + Continue, + Script(Handle), + Plaintext, + RawData(states::RawKind), +} + +/// Types which can receive tokens from the tokenizer. +pub trait TokenSink { + type Handle; + + /// Process a token. + fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult; + + // Signal sink that tokenization reached the end. + fn end(&self) {} + + /// Used in the markup declaration open state. By default, this always + /// returns false and thus all CDATA sections are tokenized as bogus + /// comments. + /// + fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool { + false + } +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/mod.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/mod.rs new file mode 100644 index 000000000..b98be420b --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/mod.rs @@ -0,0 +1,1761 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! The HTML5 tokenizer. 
+ +pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; +pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token}; +pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind}; +pub use self::interface::{TokenSink, TokenSinkResult}; + +use self::states::{DoctypeIdKind, Public, System}; +use self::states::{DoubleEscaped, Escaped}; +use self::states::{DoubleQuoted, SingleQuoted, Unquoted}; +use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped}; + +use self::char_ref::{CharRef, CharRefTokenizer}; + +use crate::util::str::lower_ascii_letter; + +use log::{debug, trace}; +use mac::format_if; +use markup5ever::{namespace_url, ns, small_char_set, TokenizerResult}; +use std::borrow::Cow::{self, Borrowed}; +use std::cell::{Cell, RefCell, RefMut}; +use std::collections::BTreeMap; +use std::mem; + +pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; +use crate::tendril::StrTendril; +use crate::{Attribute, LocalName, QualName, SmallCharSet}; + +mod char_ref; +mod interface; +pub mod states; + +pub enum ProcessResult { + Continue, + Suspend, + Script(Handle), +} + +fn option_push(opt_str: &mut Option, c: char) { + match *opt_str { + Some(ref mut s) => s.push_char(c), + None => *opt_str = Some(StrTendril::from_char(c)), + } +} + +/// Tokenizer options, with an impl for `Default`. +#[derive(Clone)] +pub struct TokenizerOpts { + /// Report all parse errors described in the spec, at some + /// performance penalty? Default: false + pub exact_errors: bool, + + /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning + /// of the stream? Default: true + pub discard_bom: bool, + + /// Keep a record of how long we spent in each state? Printed + /// when `end()` is called. Default: false + pub profile: bool, + + /// Initial state override. Only the test runner should use + /// a non-`None` value! + pub initial_state: Option, + + /// Last start tag. 
Only the test runner should use a + /// non-`None` value! + /// + /// FIXME: Can't use Tendril because we want TokenizerOpts + /// to be Send. + pub last_start_tag_name: Option, +} + +impl Default for TokenizerOpts { + fn default() -> TokenizerOpts { + TokenizerOpts { + exact_errors: false, + discard_bom: true, + profile: false, + initial_state: None, + last_start_tag_name: None, + } + } +} + +/// The HTML tokenizer. +pub struct Tokenizer { + /// Options controlling the behavior of the tokenizer. + opts: TokenizerOpts, + + /// Destination for tokens we emit. + pub sink: Sink, + + /// The abstract machine state as described in the spec. + state: Cell, + + /// Are we at the end of the file, once buffers have been processed + /// completely? This affects whether we will wait for lookahead or not. + at_eof: Cell, + + /// Tokenizer for character references, if we're tokenizing + /// one at the moment. + char_ref_tokenizer: RefCell>>, + + /// Current input character. Just consumed, may reconsume. + current_char: Cell, + + /// Should we reconsume the current input character? + reconsume: Cell, + + /// Did we just consume \r, translating it to \n? In that case we need + /// to ignore the next character if it's \n. + ignore_lf: Cell, + + /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the + /// beginning of the stream. + discard_bom: Cell, + + /// Current tag kind. + current_tag_kind: Cell, + + /// Current tag name. + current_tag_name: RefCell, + + /// Current tag is self-closing? + current_tag_self_closing: Cell, + + /// Current tag attributes. + current_tag_attrs: RefCell>, + + /// Current attribute name. + current_attr_name: RefCell, + + /// Current attribute value. + current_attr_value: RefCell, + + /// Current comment. + current_comment: RefCell, + + /// Current doctype token. + current_doctype: RefCell, + + /// Last start tag name, for use in checking "appropriate end tag". 
+ last_start_tag_name: RefCell>, + + /// The "temporary buffer" mentioned in the spec. + temp_buf: RefCell, + + /// Record of how many ns we spent in each state, if profiling is enabled. + state_profile: RefCell>, + + /// Record of how many ns we spent in the token sink. + time_in_sink: Cell, + + /// Track current line + current_line: Cell, +} + +impl Tokenizer { + /// Create a new tokenizer which feeds tokens to a particular `TokenSink`. + pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer { + let start_tag_name = opts + .last_start_tag_name + .take() + .map(|s| LocalName::from(&*s)); + let state = opts.initial_state.unwrap_or(states::Data); + let discard_bom = opts.discard_bom; + Tokenizer { + opts, + sink, + state: Cell::new(state), + char_ref_tokenizer: RefCell::new(None), + at_eof: Cell::new(false), + current_char: Cell::new('\0'), + reconsume: Cell::new(false), + ignore_lf: Cell::new(false), + discard_bom: Cell::new(discard_bom), + current_tag_kind: Cell::new(StartTag), + current_tag_name: RefCell::new(StrTendril::new()), + current_tag_self_closing: Cell::new(false), + current_tag_attrs: RefCell::new(vec![]), + current_attr_name: RefCell::new(StrTendril::new()), + current_attr_value: RefCell::new(StrTendril::new()), + current_comment: RefCell::new(StrTendril::new()), + current_doctype: RefCell::new(Doctype::default()), + last_start_tag_name: RefCell::new(start_tag_name), + temp_buf: RefCell::new(StrTendril::new()), + state_profile: RefCell::new(BTreeMap::new()), + time_in_sink: Cell::new(0), + current_line: Cell::new(1), + } + } + + /// Feed an input string into the tokenizer. 
+ pub fn feed(&self, input: &BufferQueue) -> TokenizerResult { + if input.is_empty() { + return TokenizerResult::Done; + } + + if self.discard_bom.get() { + if let Some(c) = input.peek() { + if c == '\u{feff}' { + input.next(); + } + } else { + return TokenizerResult::Done; + } + }; + + self.run(input) + } + + pub fn set_plaintext_state(&self) { + self.state.set(states::Plaintext); + } + + fn process_token(&self, token: Token) -> TokenSinkResult { + if self.opts.profile { + let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get())); + self.time_in_sink.set(self.time_in_sink.get() + dt); + ret + } else { + self.sink.process_token(token, self.current_line.get()) + } + } + + fn process_token_and_continue(&self, token: Token) { + assert!(matches!( + self.process_token(token), + TokenSinkResult::Continue + )); + } + + //§ preprocessing-the-input-stream + // Get the next input character, which might be the character + // 'c' that we already consumed from the buffers. + fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option { + if self.ignore_lf.get() { + self.ignore_lf.set(false); + if c == '\n' { + c = input.next()?; + } + } + + if c == '\r' { + self.ignore_lf.set(true); + c = '\n'; + } + + if c == '\n' { + self.current_line.set(self.current_line.get() + 1); + } + + if self.opts.exact_errors + && match c as u32 { + 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true, + n if (n & 0xFFFE) == 0xFFFE => true, + _ => false, + } + { + let msg = format!("Bad character {c}"); + self.emit_error(Cow::Owned(msg)); + } + + trace!("got character {}", c); + self.current_char.set(c); + Some(c) + } + + //§ tokenization + // Get the next input character, if one is available. 
+ fn get_char(&self, input: &BufferQueue) -> Option { + if self.reconsume.get() { + self.reconsume.set(false); + Some(self.current_char.get()) + } else { + input + .next() + .and_then(|c| self.get_preprocessed_char(c, input)) + } + } + + fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option { + // Bail to the slow path for various corner cases. + // This means that `FromSet` can contain characters not in the set! + // It shouldn't matter because the fallback `FromSet` case should + // always do the same thing as the `NotFromSet` case. + if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() { + return self.get_char(input).map(FromSet); + } + + let d = input.pop_except_from(set); + trace!("got characters {:?}", d); + match d { + Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet), + + // NB: We don't set self.current_char for a run of characters not + // in the set. It shouldn't matter for the codepaths that use + // this. + _ => d, + } + } + + // Check if the next characters are an ASCII case-insensitive match. See + // BufferQueue::eat. + // + // NB: this doesn't set the current input character. + fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option { + if self.ignore_lf.get() { + self.ignore_lf.set(false); + if self.peek(input) == Some('\n') { + self.discard_char(input); + } + } + + input.push_front(mem::take(&mut self.temp_buf.borrow_mut())); + match input.eat(pat, eq) { + None if self.at_eof.get() => Some(false), + None => { + while let Some(data) = input.next() { + self.temp_buf.borrow_mut().push_char(data); + } + None + }, + Some(matched) => Some(matched), + } + } + + /// Run the state machine for as long as we can. 
+ fn run(&self, input: &BufferQueue) -> TokenizerResult { + if self.opts.profile { + loop { + let state = self.state.get(); + let old_sink = self.time_in_sink.get(); + let (run, mut dt) = time!(self.step(input)); + dt -= (self.time_in_sink.get() - old_sink); + let new = match self.state_profile.borrow_mut().get_mut(&state) { + Some(x) => { + *x += dt; + false + }, + None => true, + }; + if new { + // do this here because of borrow shenanigans + self.state_profile.borrow_mut().insert(state, dt); + } + match run { + ProcessResult::Continue => (), + ProcessResult::Suspend => break, + ProcessResult::Script(node) => return TokenizerResult::Script(node), + } + } + } else { + loop { + match self.step(input) { + ProcessResult::Continue => (), + ProcessResult::Suspend => break, + ProcessResult::Script(node) => return TokenizerResult::Script(node), + } + } + } + TokenizerResult::Done + } + + fn bad_char_error(&self) { + let msg = format_if!( + self.opts.exact_errors, + "Bad character", + "Saw {} in state {:?}", + self.current_char.get(), + self.state.get() + ); + self.emit_error(msg); + } + + fn bad_eof_error(&self) { + let msg = format_if!( + self.opts.exact_errors, + "Unexpected EOF", + "Saw EOF in state {:?}", + self.state.get() + ); + self.emit_error(msg); + } + + fn emit_char(&self, c: char) { + self.process_token_and_continue(match c { + '\0' => NullCharacterToken, + _ => CharacterTokens(StrTendril::from_char(c)), + }); + } + + // The string must not contain '\0'! 
+ fn emit_chars(&self, b: StrTendril) { + self.process_token_and_continue(CharacterTokens(b)); + } + + fn emit_current_tag(&self) -> ProcessResult { + self.finish_attribute(); + + let name = LocalName::from(&**self.current_tag_name.borrow()); + self.current_tag_name.borrow_mut().clear(); + + match self.current_tag_kind.get() { + StartTag => { + *self.last_start_tag_name.borrow_mut() = Some(name.clone()); + }, + EndTag => { + if !self.current_tag_attrs.borrow().is_empty() { + self.emit_error(Borrowed("Attributes on an end tag")); + } + if self.current_tag_self_closing.get() { + self.emit_error(Borrowed("Self-closing end tag")); + } + }, + } + + let token = TagToken(Tag { + kind: self.current_tag_kind.get(), + name, + self_closing: self.current_tag_self_closing.get(), + attrs: std::mem::take(&mut self.current_tag_attrs.borrow_mut()), + }); + + match self.process_token(token) { + TokenSinkResult::Continue => ProcessResult::Continue, + TokenSinkResult::Plaintext => { + self.state.set(states::Plaintext); + ProcessResult::Continue + }, + TokenSinkResult::Script(node) => { + self.state.set(states::Data); + ProcessResult::Script(node) + }, + TokenSinkResult::RawData(kind) => { + self.state.set(states::RawData(kind)); + ProcessResult::Continue + }, + } + } + + fn emit_temp_buf(&self) { + // FIXME: Make sure that clearing on emit is spec-compatible. + let buf = mem::take(&mut *self.temp_buf.borrow_mut()); + self.emit_chars(buf); + } + + fn clear_temp_buf(&self) { + // Do this without a new allocation. 
+ self.temp_buf.borrow_mut().clear(); + } + + fn emit_current_comment(&self) { + let comment = mem::take(&mut *self.current_comment.borrow_mut()); + self.process_token_and_continue(CommentToken(comment)); + } + + fn discard_tag(&self) { + self.current_tag_name.borrow_mut().clear(); + self.current_tag_self_closing.set(false); + *self.current_tag_attrs.borrow_mut() = vec![]; + } + + fn create_tag(&self, kind: TagKind, c: char) { + self.discard_tag(); + self.current_tag_name.borrow_mut().push_char(c); + self.current_tag_kind.set(kind); + } + + fn have_appropriate_end_tag(&self) -> bool { + match self.last_start_tag_name.borrow().as_ref() { + Some(last) => { + (self.current_tag_kind.get() == EndTag) + && (**self.current_tag_name.borrow() == **last) + }, + None => false, + } + } + + fn create_attribute(&self, c: char) { + self.finish_attribute(); + + self.current_attr_name.borrow_mut().push_char(c); + } + + fn finish_attribute(&self) { + if self.current_attr_name.borrow().is_empty() { + return; + } + + // Check for a duplicate attribute. + // FIXME: the spec says we should error as soon as the name is finished. + let dup = { + let name = &*self.current_attr_name.borrow(); + self.current_tag_attrs + .borrow() + .iter() + .any(|a| *a.name.local == **name) + }; + + if dup { + self.emit_error(Borrowed("Duplicate attribute")); + self.current_attr_name.borrow_mut().clear(); + self.current_attr_value.borrow_mut().clear(); + } else { + let name = LocalName::from(&**self.current_attr_name.borrow()); + self.current_attr_name.borrow_mut().clear(); + self.current_tag_attrs.borrow_mut().push(Attribute { + // The tree builder will adjust the namespace if necessary. + // This only happens in foreign elements. 
+ name: QualName::new(None, ns!(), name), + value: mem::take(&mut self.current_attr_value.borrow_mut()), + }); + } + } + + fn emit_current_doctype(&self) { + let doctype = self.current_doctype.take(); + self.process_token_and_continue(DoctypeToken(doctype)); + } + + fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut> { + let current_doctype = self.current_doctype.borrow_mut(); + match kind { + Public => RefMut::map(current_doctype, |d| &mut d.public_id), + System => RefMut::map(current_doctype, |d| &mut d.system_id), + } + } + + fn clear_doctype_id(&self, kind: DoctypeIdKind) { + let mut id = self.doctype_id(kind); + match *id { + Some(ref mut s) => s.clear(), + None => *id = Some(StrTendril::new()), + } + } + + fn consume_char_ref(&self) { + *self.char_ref_tokenizer.borrow_mut() = Some(Box::new(CharRefTokenizer::new(matches!( + self.state.get(), + states::AttributeValue(_) + )))); + } + + fn emit_eof(&self) { + self.process_token_and_continue(EOFToken); + } + + fn peek(&self, input: &BufferQueue) -> Option { + if self.reconsume.get() { + Some(self.current_char.get()) + } else { + input.peek() + } + } + + fn discard_char(&self, input: &BufferQueue) { + // peek() deals in un-processed characters (no newline normalization), while get_char() + // does. + // + // since discard_char is supposed to be used in combination with peek(), discard_char must + // discard a single raw input character, not a normalized newline. + if self.reconsume.get() { + self.reconsume.set(false); + } else { + input.next(); + } + } + + fn emit_error(&self, error: Cow<'static, str>) { + self.process_token_and_continue(ParseError(error)); + } +} +//§ END + +// Shorthand for common state machine behaviors. +macro_rules! 
shorthand ( + ( $me:ident : emit $c:expr ) => ( $me.emit_char($c) ); + ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) ); + ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.borrow_mut().push_char($c) ); + ( $me:ident : discard_tag ) => ( $me.discard_tag() ); + ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) ); + ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.borrow_mut().push_char($c) ); + ( $me:ident : emit_temp ) => ( $me.emit_temp_buf() ); + ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() ); + ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) ); + ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.borrow_mut().push_char($c) ); + ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_char($c) ); + ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_tendril($c)); + ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_char($c) ); + ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_slice($c) ); + ( $me:ident : emit_comment ) => ( $me.emit_current_comment() ); + ( $me:ident : clear_comment ) => ( $me.current_comment.borrow_mut().clear() ); + ( $me:ident : create_doctype ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() ); + ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) ); + ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c) ); + ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) ); + ( $me:ident : force_quirks ) => ( $me.current_doctype.borrow_mut().force_quirks = true); + ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() ); + ( $me:ident : error ) => ( $me.bad_char_error() ); + ( $me:ident : error_eof ) => ( $me.bad_eof_error() ); +); + +// Tracing of tokenizer actions. 
This adds significant bloat and compile time, +// so it's behind a cfg flag. +#[cfg(feature = "trace_tokenizer")] +macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({ + trace!(" {:?}", stringify!($($cmds)*)); + shorthand!($me : $($cmds)*); +})); + +#[cfg(not(feature = "trace_tokenizer"))] +macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) ); + +// A little DSL for sequencing shorthand actions. +macro_rules! go ( + // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity. + // We have to tell the parser how much lookahead we need. + + ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); }); + ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); }); + ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); }); + ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); }); + + // These can only come at the end. + + ( $me:ident : to $s:ident ) => ({ $me.state.set(states::$s); return ProcessResult::Continue; }); + ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue; }); + ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; }); + + ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); }); + ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); }); + ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); }); + + ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(); return ProcessResult::Continue; }); + + // We have a default next state after emitting a tag, but the sink can override. 
+ ( $me:ident : emit_tag $s:ident ) => ({ + $me.state.set(states::$s); + return $me.emit_current_tag(); + }); + + ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; }); + + // If nothing else matched, it's a single command + ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) ); + + // or nothing. + ( $me:ident : ) => (()); +); + +macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => ( + match $x { + $($pats)|+ => go!($me: $($cmds)*), + _ => (), + } +)); + +// This is a macro because it can cause early return +// from the function where it is used. +macro_rules! get_char ( ($me:expr, $input:expr) => ( + unwrap_or_return!($me.get_char($input), ProcessResult::Suspend) +)); + +macro_rules! peek ( ($me:expr, $input:expr) => ( + unwrap_or_return!($me.peek($input), ProcessResult::Suspend) +)); + +macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => ( + unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend) +)); + +macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => ( + unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend) +)); + +macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => ( + unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend) +)); + +impl Tokenizer { + // Run the state machine for a while. + // Return true if we should be immediately re-invoked + // (this just simplifies control flow vs. break / continue). 
+ #[allow(clippy::never_loop)] + fn step(&self, input: &BufferQueue) -> ProcessResult { + if self.char_ref_tokenizer.borrow().is_some() { + return self.step_char_ref_tokenizer(input); + } + + trace!("processing in state {:?}", self.state); + match self.state.get() { + //§ data-state + states::Data => loop { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { + FromSet('\0') => go!(self: error; emit '\0'), + FromSet('&') => go!(self: consume_char_ref), + FromSet('<') => go!(self: to TagOpen), + FromSet(c) => go!(self: emit c), + NotFromSet(b) => self.emit_chars(b), + } + }, + + //§ rcdata-state + states::RawData(Rcdata) => loop { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { + FromSet('\0') => go!(self: error; emit '\u{fffd}'), + FromSet('&') => go!(self: consume_char_ref), + FromSet('<') => go!(self: to RawLessThanSign Rcdata), + FromSet(c) => go!(self: emit c), + NotFromSet(b) => self.emit_chars(b), + } + }, + + //§ rawtext-state + states::RawData(Rawtext) => loop { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { + FromSet('\0') => go!(self: error; emit '\u{fffd}'), + FromSet('<') => go!(self: to RawLessThanSign Rawtext), + FromSet(c) => go!(self: emit c), + NotFromSet(b) => self.emit_chars(b), + } + }, + + //§ script-data-state + states::RawData(ScriptData) => loop { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { + FromSet('\0') => go!(self: error; emit '\u{fffd}'), + FromSet('<') => go!(self: to RawLessThanSign ScriptData), + FromSet(c) => go!(self: emit c), + NotFromSet(b) => self.emit_chars(b), + } + }, + + //§ script-data-escaped-state + states::RawData(ScriptDataEscaped(Escaped)) => loop { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { + FromSet('\0') => go!(self: error; emit '\u{fffd}'), + FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped), + FromSet('<') => go!(self: to 
RawLessThanSign ScriptDataEscaped Escaped), + FromSet(c) => go!(self: emit c), + NotFromSet(b) => self.emit_chars(b), + } + }, + + //§ script-data-double-escaped-state + states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { + FromSet('\0') => go!(self: error; emit '\u{fffd}'), + FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped), + FromSet('<') => { + go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped) + }, + FromSet(c) => go!(self: emit c), + NotFromSet(b) => self.emit_chars(b), + } + }, + + //§ plaintext-state + states::Plaintext => loop { + match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) { + FromSet('\0') => go!(self: error; emit '\u{fffd}'), + FromSet(c) => go!(self: emit c), + NotFromSet(b) => self.emit_chars(b), + } + }, + + //§ tag-open-state + states::TagOpen => loop { + match get_char!(self, input) { + '!' => go!(self: to MarkupDeclarationOpen), + '/' => go!(self: to EndTagOpen), + '?' 
=> go!(self: error; clear_comment; reconsume BogusComment), + c => match lower_ascii_letter(c) { + Some(cl) => go!(self: create_tag StartTag cl; to TagName), + None => go!(self: error; emit '<'; reconsume Data), + }, + } + }, + + //§ end-tag-open-state + states::EndTagOpen => loop { + match get_char!(self, input) { + '>' => go!(self: error; to Data), + c => match lower_ascii_letter(c) { + Some(cl) => go!(self: create_tag EndTag cl; to TagName), + None => go!(self: error; clear_comment; reconsume BogusComment), + }, + } + }, + + //§ tag-name-state + states::TagName => loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), + '/' => go!(self: to SelfClosingStartTag), + '>' => go!(self: emit_tag Data), + '\0' => go!(self: error; push_tag '\u{fffd}'), + c => go!(self: push_tag (c.to_ascii_lowercase())), + } + }, + + //§ script-data-escaped-less-than-sign-state + states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop { + match get_char!(self, input) { + '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped), + c => match lower_ascii_letter(c) { + Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c; + to ScriptDataEscapeStart DoubleEscaped), + None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped), + }, + } + }, + + //§ script-data-double-escaped-less-than-sign-state + states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop { + match get_char!(self, input) { + '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd), + _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), + } + }, + + //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state + // otherwise + states::RawLessThanSign(kind) => loop { + match get_char!(self, input) { + '/' => go!(self: clear_temp; to RawEndTagOpen kind), + '!' 
if kind == ScriptData => { + go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped) + }, + _ => go!(self: emit '<'; reconsume RawData kind), + } + }, + + //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state + states::RawEndTagOpen(kind) => loop { + let c = get_char!(self, input); + match lower_ascii_letter(c) { + Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind), + None => go!(self: emit '<'; emit '/'; reconsume RawData kind), + } + }, + + //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state + states::RawEndTagName(kind) => loop { + let c = get_char!(self, input); + if self.have_appropriate_end_tag() { + match c { + '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName), + '/' => go!(self: clear_temp; to SelfClosingStartTag), + '>' => go!(self: clear_temp; emit_tag Data), + _ => (), + } + } + + match lower_ascii_letter(c) { + Some(cl) => go!(self: push_tag cl; push_temp c), + None => { + go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind) + }, + } + }, + + //§ script-data-double-escape-start-state + states::ScriptDataEscapeStart(DoubleEscaped) => loop { + let c = get_char!(self, input); + match c { + '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { + let esc = if &**self.temp_buf.borrow() == "script" { + DoubleEscaped + } else { + Escaped + }; + go!(self: emit c; to RawData ScriptDataEscaped esc); + }, + _ => match lower_ascii_letter(c) { + Some(cl) => go!(self: push_temp cl; emit c), + None => go!(self: reconsume RawData ScriptDataEscaped Escaped), + }, + } + }, + + //§ script-data-escape-start-state + states::ScriptDataEscapeStart(Escaped) => loop { + match get_char!(self, input) { + '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash), + _ => go!(self: reconsume RawData ScriptData), + } + }, + + //§ 
script-data-escape-start-dash-state + states::ScriptDataEscapeStartDash => loop { + match get_char!(self, input) { + '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped), + _ => go!(self: reconsume RawData ScriptData), + } + }, + + //§ script-data-escaped-dash-state script-data-double-escaped-dash-state + states::ScriptDataEscapedDash(kind) => loop { + match get_char!(self, input) { + '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind), + '<' => { + if kind == DoubleEscaped { + go!(self: emit '<'); + } + go!(self: to RawLessThanSign ScriptDataEscaped kind); + }, + '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind), + c => go!(self: emit c; to RawData ScriptDataEscaped kind), + } + }, + + //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state + states::ScriptDataEscapedDashDash(kind) => loop { + match get_char!(self, input) { + '-' => go!(self: emit '-'), + '<' => { + if kind == DoubleEscaped { + go!(self: emit '<'); + } + go!(self: to RawLessThanSign ScriptDataEscaped kind); + }, + '>' => go!(self: emit '>'; to RawData ScriptData), + '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind), + c => go!(self: emit c; to RawData ScriptDataEscaped kind), + } + }, + + //§ script-data-double-escape-end-state + states::ScriptDataDoubleEscapeEnd => loop { + let c = get_char!(self, input); + match c { + '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { + let esc = if &**self.temp_buf.borrow() == "script" { + Escaped + } else { + DoubleEscaped + }; + go!(self: emit c; to RawData ScriptDataEscaped esc); + }, + _ => match lower_ascii_letter(c) { + Some(cl) => go!(self: push_temp cl; emit c), + None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), + }, + } + }, + + //§ before-attribute-name-state + states::BeforeAttributeName => loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => (), + '/' => go!(self: to SelfClosingStartTag), + '>' => go!(self: emit_tag 
Data), + '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName), + c => match lower_ascii_letter(c) { + Some(cl) => go!(self: create_attr cl; to AttributeName), + None => { + go_match!(self: c, + '"' , '\'' , '<' , '=' => error); + go!(self: create_attr c; to AttributeName); + }, + }, + } + }, + + //§ attribute-name-state + states::AttributeName => loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName), + '/' => go!(self: to SelfClosingStartTag), + '=' => go!(self: to BeforeAttributeValue), + '>' => go!(self: emit_tag Data), + '\0' => go!(self: error; push_name '\u{fffd}'), + c => match lower_ascii_letter(c) { + Some(cl) => go!(self: push_name cl), + None => { + go_match!(self: c, + '"' , '\'' , '<' => error); + go!(self: push_name c); + }, + }, + } + }, + + //§ after-attribute-name-state + states::AfterAttributeName => loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => (), + '/' => go!(self: to SelfClosingStartTag), + '=' => go!(self: to BeforeAttributeValue), + '>' => go!(self: emit_tag Data), + '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName), + c => match lower_ascii_letter(c) { + Some(cl) => go!(self: create_attr cl; to AttributeName), + None => { + go_match!(self: c, + '"' , '\'' , '<' => error); + go!(self: create_attr c; to AttributeName); + }, + }, + } + }, + + //§ before-attribute-value-state + // Use peek so we can handle the first attr character along with the rest, + // hopefully in the same zero-copy buffer. 
+ states::BeforeAttributeValue => loop { + match peek!(self, input) { + '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input), + '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted), + '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted), + '>' => go!(self: discard_char input; error; emit_tag Data), + _ => go!(self: to AttributeValue Unquoted), + } + }, + + //§ attribute-value-(double-quoted)-state + states::AttributeValue(DoubleQuoted) => loop { + match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) { + FromSet('"') => go!(self: to AfterAttributeValueQuoted), + FromSet('&') => go!(self: consume_char_ref), + FromSet('\0') => go!(self: error; push_value '\u{fffd}'), + FromSet(c) => go!(self: push_value c), + NotFromSet(ref b) => go!(self: append_value b), + } + }, + + //§ attribute-value-(single-quoted)-state + states::AttributeValue(SingleQuoted) => loop { + match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) { + FromSet('\'') => go!(self: to AfterAttributeValueQuoted), + FromSet('&') => go!(self: consume_char_ref), + FromSet('\0') => go!(self: error; push_value '\u{fffd}'), + FromSet(c) => go!(self: push_value c), + NotFromSet(ref b) => go!(self: append_value b), + } + }, + + //§ attribute-value-(unquoted)-state + states::AttributeValue(Unquoted) => loop { + match pop_except_from!( + self, + input, + small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') + ) { + FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { + go!(self: to BeforeAttributeName) + }, + FromSet('&') => go!(self: consume_char_ref), + FromSet('>') => go!(self: emit_tag Data), + FromSet('\0') => go!(self: error; push_value '\u{fffd}'), + FromSet(c) => { + go_match!(self: c, + '"' , '\'' , '<' , '=' , '`' => error); + go!(self: push_value c); + }, + NotFromSet(ref b) => go!(self: append_value b), + } + }, + + //§ after-attribute-value-(quoted)-state + states::AfterAttributeValueQuoted => 
loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), + '/' => go!(self: to SelfClosingStartTag), + '>' => go!(self: emit_tag Data), + _ => go!(self: error; reconsume BeforeAttributeName), + } + }, + + //§ self-closing-start-tag-state + states::SelfClosingStartTag => loop { + match get_char!(self, input) { + '>' => { + self.current_tag_self_closing.set(true); + go!(self: emit_tag Data); + }, + _ => go!(self: error; reconsume BeforeAttributeName), + } + }, + + //§ comment-start-state + states::CommentStart => loop { + match get_char!(self, input) { + '-' => go!(self: to CommentStartDash), + '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment), + '>' => go!(self: error; emit_comment; to Data), + c => go!(self: push_comment c; to Comment), + } + }, + + //§ comment-start-dash-state + states::CommentStartDash => loop { + match get_char!(self, input) { + '-' => go!(self: to CommentEnd), + '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), + '>' => go!(self: error; emit_comment; to Data), + c => go!(self: push_comment '-'; push_comment c; to Comment), + } + }, + + //§ comment-state + states::Comment => loop { + match get_char!(self, input) { + c @ '<' => go!(self: push_comment c; to CommentLessThanSign), + '-' => go!(self: to CommentEndDash), + '\0' => go!(self: error; push_comment '\u{fffd}'), + c => go!(self: push_comment c), + } + }, + + //§ comment-less-than-sign-state + states::CommentLessThanSign => loop { + match get_char!(self, input) { + c @ '!' 
=> go!(self: push_comment c; to CommentLessThanSignBang), + c @ '<' => go!(self: push_comment c), + _ => go!(self: reconsume Comment), + } + }, + + //§ comment-less-than-sign-bang + states::CommentLessThanSignBang => loop { + match get_char!(self, input) { + '-' => go!(self: to CommentLessThanSignBangDash), + _ => go!(self: reconsume Comment), + } + }, + + //§ comment-less-than-sign-bang-dash + states::CommentLessThanSignBangDash => loop { + match get_char!(self, input) { + '-' => go!(self: to CommentLessThanSignBangDashDash), + _ => go!(self: reconsume CommentEndDash), + } + }, + + //§ comment-less-than-sign-bang-dash-dash + states::CommentLessThanSignBangDashDash => loop { + match get_char!(self, input) { + '>' => go!(self: reconsume CommentEnd), + _ => go!(self: error; reconsume CommentEnd), + } + }, + + //§ comment-end-dash-state + states::CommentEndDash => loop { + match get_char!(self, input) { + '-' => go!(self: to CommentEnd), + '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), + c => go!(self: push_comment '-'; push_comment c; to Comment), + } + }, + + //§ comment-end-state + states::CommentEnd => loop { + match get_char!(self, input) { + '>' => go!(self: emit_comment; to Data), + '!' 
=> go!(self: to CommentEndBang), + '-' => go!(self: push_comment '-'), + _ => go!(self: append_comment "--"; reconsume Comment), + } + }, + + //§ comment-end-bang-state + states::CommentEndBang => loop { + match get_char!(self, input) { + '-' => go!(self: append_comment "--!"; to CommentEndDash), + '>' => go!(self: error; emit_comment; to Data), + '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment), + c => go!(self: append_comment "--!"; push_comment c; to Comment), + } + }, + + //§ doctype-state + states::Doctype => loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName), + '>' => go!(self: reconsume BeforeDoctypeName), + _ => go!(self: error; reconsume BeforeDoctypeName), + } + }, + + //§ before-doctype-name-state + states::BeforeDoctypeName => loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => (), + '\0' => { + go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName) + }, + '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data), + c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase()); + to DoctypeName), + } + }, + + //§ doctype-name-state + states::DoctypeName => loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName), + '>' => go!(self: emit_doctype; to Data), + '\0' => go!(self: error; push_doctype_name '\u{fffd}'), + c => go!(self: push_doctype_name (c.to_ascii_lowercase())), + } + }, + + //§ after-doctype-name-state + states::AfterDoctypeName => loop { + if eat!(self, input, "public") { + go!(self: to AfterDoctypeKeyword Public); + } else if eat!(self, input, "system") { + go!(self: to AfterDoctypeKeyword System); + } else { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => (), + '>' => go!(self: emit_doctype; to Data), + _ => go!(self: error; force_quirks; reconsume BogusDoctype), + } + } + }, + + //§ after-doctype-public-keyword-state 
after-doctype-system-keyword-state + states::AfterDoctypeKeyword(kind) => loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind), + '"' => { + go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind) + }, + '\'' => { + go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind) + }, + '>' => go!(self: error; force_quirks; emit_doctype; to Data), + _ => go!(self: error; force_quirks; reconsume BogusDoctype), + } + }, + + //§ before-doctype-public-identifier-state before-doctype-system-identifier-state + states::BeforeDoctypeIdentifier(kind) => loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => (), + '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind), + '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind), + '>' => go!(self: error; force_quirks; emit_doctype; to Data), + _ => go!(self: error; force_quirks; reconsume BogusDoctype), + } + }, + + //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state + states::DoctypeIdentifierDoubleQuoted(kind) => loop { + match get_char!(self, input) { + '"' => go!(self: to AfterDoctypeIdentifier kind), + '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), + '>' => go!(self: error; force_quirks; emit_doctype; to Data), + c => go!(self: push_doctype_id kind c), + } + }, + + //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state + states::DoctypeIdentifierSingleQuoted(kind) => loop { + match get_char!(self, input) { + '\'' => go!(self: to AfterDoctypeIdentifier kind), + '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), + '>' => go!(self: error; force_quirks; emit_doctype; to Data), + c => go!(self: push_doctype_id kind c), + } + }, + + //§ after-doctype-public-identifier-state + states::AfterDoctypeIdentifier(Public) => loop { + match get_char!(self, input) { 
+ '\t' | '\n' | '\x0C' | ' ' => { + go!(self: to BetweenDoctypePublicAndSystemIdentifiers) + }, + '>' => go!(self: emit_doctype; to Data), + '"' => { + go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) + }, + '\'' => { + go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) + }, + _ => go!(self: error; force_quirks; reconsume BogusDoctype), + } + }, + + //§ after-doctype-system-identifier-state + states::AfterDoctypeIdentifier(System) => loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => (), + '>' => go!(self: emit_doctype; to Data), + _ => go!(self: error; reconsume BogusDoctype), + } + }, + + //§ between-doctype-public-and-system-identifiers-state + states::BetweenDoctypePublicAndSystemIdentifiers => loop { + match get_char!(self, input) { + '\t' | '\n' | '\x0C' | ' ' => (), + '>' => go!(self: emit_doctype; to Data), + '"' => { + go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) + }, + '\'' => { + go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) + }, + _ => go!(self: error; force_quirks; reconsume BogusDoctype), + } + }, + + //§ bogus-doctype-state + states::BogusDoctype => loop { + match get_char!(self, input) { + '>' => go!(self: emit_doctype; to Data), + '\0' => go!(self: error), + _ => (), + } + }, + + //§ bogus-comment-state + states::BogusComment => loop { + match get_char!(self, input) { + '>' => go!(self: emit_comment; to Data), + '\0' => go!(self: error; push_comment '\u{fffd}'), + c => go!(self: push_comment c), + } + }, + + //§ markup-declaration-open-state + states::MarkupDeclarationOpen => loop { + if eat_exact!(self, input, "--") { + go!(self: clear_comment; to CommentStart); + } else if eat!(self, input, "doctype") { + go!(self: to Doctype); + } else { + if self + .sink + .adjusted_current_node_present_but_not_in_html_namespace() + && eat_exact!(self, input, "[CDATA[") + { + go!(self: clear_temp; to CdataSection); 
+ } + go!(self: error; clear_comment; to BogusComment); + } + }, + + //§ cdata-section-state + states::CdataSection => loop { + match get_char!(self, input) { + ']' => go!(self: to CdataSectionBracket), + '\0' => go!(self: emit_temp; emit '\0'), + c => go!(self: push_temp c), + } + }, + + //§ cdata-section-bracket + states::CdataSectionBracket => match get_char!(self, input) { + ']' => go!(self: to CdataSectionEnd), + _ => go!(self: push_temp ']'; reconsume CdataSection), + }, + + //§ cdata-section-end + states::CdataSectionEnd => loop { + match get_char!(self, input) { + ']' => go!(self: push_temp ']'), + '>' => go!(self: emit_temp; to Data), + _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection), + } + }, + //§ END + } + } + + fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult { + // FIXME HACK: Take and replace the tokenizer so we don't + // double-mut-borrow self. This is why it's boxed. + let mut tok = self.char_ref_tokenizer.take().unwrap(); + let outcome = tok.step(self, input); + + let progress = match outcome { + char_ref::Done => { + self.process_char_ref(tok.get_result()); + return ProcessResult::Continue; + }, + + char_ref::Stuck => ProcessResult::Suspend, + char_ref::Progress => ProcessResult::Continue, + }; + + *self.char_ref_tokenizer.borrow_mut() = Some(tok); + progress + } + + fn process_char_ref(&self, char_ref: CharRef) { + let CharRef { + mut chars, + mut num_chars, + } = char_ref; + + if num_chars == 0 { + chars[0] = '&'; + num_chars = 1; + } + + for i in 0..num_chars { + let c = chars[i as usize]; + match self.state.get() { + states::Data | states::RawData(states::Rcdata) => go!(self: emit c), + + states::AttributeValue(_) => go!(self: push_value c), + + _ => panic!( + "state {:?} should not be reachable in process_char_ref", + self.state.get() + ), + } + } + } + + /// Indicate that we have reached the end of the input. + pub fn end(&self) { + // Handle EOF in the char ref sub-tokenizer, if there is one. 
+ // Do this first because it might un-consume stuff. + let input = BufferQueue::default(); + match self.char_ref_tokenizer.take() { + None => (), + Some(mut tok) => { + tok.end_of_file(self, &input); + self.process_char_ref(tok.get_result()); + }, + } + + // Process all remaining buffered input. + // If we're waiting for lookahead, we're not gonna get it. + self.at_eof.set(true); + assert!(matches!(self.run(&input), TokenizerResult::Done)); + assert!(input.is_empty()); + + loop { + match self.eof_step() { + ProcessResult::Continue => (), + ProcessResult::Suspend => break, + ProcessResult::Script(_) => unreachable!(), + } + } + + self.sink.end(); + + if self.opts.profile { + self.dump_profile(); + } + } + + fn dump_profile(&self) { + let mut results: Vec<(states::State, u64)> = self + .state_profile + .borrow() + .iter() + .map(|(s, t)| (*s, *t)) + .collect(); + results.sort_by(|&(_, x), &(_, y)| y.cmp(&x)); + + let total: u64 = results + .iter() + .map(|&(_, t)| t) + .fold(0, ::std::ops::Add::add); + println!("\nTokenizer profile, in nanoseconds"); + println!( + "\n{:12} total in token sink", + self.time_in_sink.get() + ); + println!("\n{total:12} total in tokenizer"); + + for (k, v) in results.into_iter() { + let pct = 100.0 * (v as f64) / (total as f64); + println!("{v:12} {pct:4.1}% {k:?}"); + } + } + + fn eof_step(&self) -> ProcessResult { + debug!("processing EOF in state {:?}", self.state.get()); + match self.state.get() { + states::Data + | states::RawData(Rcdata) + | states::RawData(Rawtext) + | states::RawData(ScriptData) + | states::Plaintext => go!(self: eof), + + states::TagName + | states::RawData(ScriptDataEscaped(_)) + | states::BeforeAttributeName + | states::AttributeName + | states::AfterAttributeName + | states::AttributeValue(_) + | states::AfterAttributeValueQuoted + | states::SelfClosingStartTag + | states::ScriptDataEscapedDash(_) + | states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data), + + states::BeforeAttributeValue => 
go!(self: reconsume AttributeValue Unquoted), + + states::TagOpen => go!(self: error_eof; emit '<'; to Data), + + states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data), + + states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => { + go!(self: to RawData ScriptDataEscaped DoubleEscaped) + }, + + states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind), + + states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind), + + states::RawEndTagName(kind) => { + go!(self: emit '<'; emit '/'; emit_temp; to RawData kind) + }, + + states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind), + + states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData), + + states::ScriptDataDoubleEscapeEnd => { + go!(self: to RawData ScriptDataEscaped DoubleEscaped) + }, + + states::CommentStart + | states::CommentStartDash + | states::Comment + | states::CommentEndDash + | states::CommentEnd + | states::CommentEndBang => go!(self: error_eof; emit_comment; to Data), + + states::CommentLessThanSign | states::CommentLessThanSignBang => { + go!(self: reconsume Comment) + }, + + states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash), + + states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd), + + states::Doctype | states::BeforeDoctypeName => { + go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data) + }, + + states::DoctypeName + | states::AfterDoctypeName + | states::AfterDoctypeKeyword(_) + | states::BeforeDoctypeIdentifier(_) + | states::DoctypeIdentifierDoubleQuoted(_) + | states::DoctypeIdentifierSingleQuoted(_) + | states::AfterDoctypeIdentifier(_) + | states::BetweenDoctypePublicAndSystemIdentifiers => { + go!(self: error_eof; force_quirks; emit_doctype; to Data) + }, + + states::BogusDoctype => go!(self: emit_doctype; to Data), + + states::BogusComment => go!(self: emit_comment; to Data), + + states::MarkupDeclarationOpen => go!(self: error; 
to BogusComment), + + states::CdataSection => go!(self: emit_temp; error_eof; to Data), + + states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection), + + states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection), + } + } +} + +#[cfg(test)] +#[allow(non_snake_case)] +mod test { + use super::option_push; // private items + use crate::tendril::{SliceExt, StrTendril}; + + use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; + + use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; + use super::interface::{EndTag, StartTag, Tag, TagKind}; + use super::interface::{TagToken, Token}; + + use markup5ever::buffer_queue::BufferQueue; + use std::cell::RefCell; + + use crate::LocalName; + + // LinesMatch implements the TokenSink trait. It is used for testing to see + // if current_line is being updated when process_token is called. The lines + // vector is a collection of the line numbers that each token is on. + struct LinesMatch { + tokens: RefCell>, + current_str: RefCell, + lines: RefCell>, + } + + impl LinesMatch { + fn new() -> LinesMatch { + LinesMatch { + tokens: RefCell::new(vec![]), + current_str: RefCell::new(StrTendril::new()), + lines: RefCell::new(vec![]), + } + } + + fn push(&self, token: Token, line_number: u64) { + self.finish_str(); + self.lines.borrow_mut().push((token, line_number)); + } + + fn finish_str(&self) { + if self.current_str.borrow().len() > 0 { + let s = self.current_str.take(); + self.tokens.borrow_mut().push(CharacterTokens(s)); + } + } + } + + impl TokenSink for LinesMatch { + type Handle = (); + + fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult { + match token { + CharacterTokens(b) => { + self.current_str.borrow_mut().push_slice(&b); + }, + + NullCharacterToken => { + self.current_str.borrow_mut().push_char('\0'); + }, + + ParseError(_) => { + panic!("unexpected parse error"); + }, + + TagToken(mut t) => { + // The spec seems to 
indicate that one can emit + // erroneous end tags with attrs, but the test + // cases don't contain them. + match t.kind { + EndTag => { + t.self_closing = false; + t.attrs = vec![]; + }, + _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)), + } + self.push(TagToken(t), line_number); + }, + + EOFToken => (), + + _ => self.push(token, line_number), + } + TokenSinkResult::Continue + } + } + + // Take in tokens, process them, and return vector with line + // numbers that each token is on + fn tokenize(input: Vec, opts: TokenizerOpts) -> Vec<(Token, u64)> { + let sink = LinesMatch::new(); + let tok = Tokenizer::new(sink, opts); + let buffer = BufferQueue::default(); + for chunk in input.into_iter() { + buffer.push_back(chunk); + let _ = tok.feed(&buffer); + } + tok.end(); + tok.sink.lines.take() + } + + // Create a tag token + fn create_tag(token: StrTendril, tagkind: TagKind) -> Token { + let name = LocalName::from(&*token); + + TagToken(Tag { + kind: tagkind, + name, + self_closing: false, + attrs: vec![], + }) + } + + #[test] + fn push_to_None_gives_singleton() { + let mut s: Option = None; + option_push(&mut s, 'x'); + assert_eq!(s, Some("x".to_tendril())); + } + + #[test] + fn push_to_empty_appends() { + let mut s: Option = Some(StrTendril::new()); + option_push(&mut s, 'x'); + assert_eq!(s, Some("x".to_tendril())); + } + + #[test] + fn push_to_nonempty_appends() { + let mut s: Option = Some(StrTendril::from_slice("y")); + option_push(&mut s, 'x'); + assert_eq!(s, Some("yx".to_tendril())); + } + + #[test] + fn check_lines() { + let opts = TokenizerOpts { + exact_errors: false, + discard_bom: true, + profile: false, + initial_state: None, + last_start_tag_name: None, + }; + let vector = vec![ + StrTendril::from("\n"), + StrTendril::from("\n"), + StrTendril::from("\n"), + StrTendril::from("\n"), + ]; + let expected = vec![ + (create_tag(StrTendril::from("a"), StartTag), 1), + (create_tag(StrTendril::from("b"), StartTag), 2), + (create_tag(StrTendril::from("b"), 
EndTag), 3), + (create_tag(StrTendril::from("a"), EndTag), 4), + ]; + let results = tokenize(vector, opts); + assert_eq!(results, expected); + } + + #[test] + fn check_lines_with_new_line() { + let opts = TokenizerOpts { + exact_errors: false, + discard_bom: true, + profile: false, + initial_state: None, + last_start_tag_name: None, + }; + let vector = vec![ + StrTendril::from("\r\n"), + StrTendril::from("\r\n"), + StrTendril::from("\r\n"), + StrTendril::from("\r\n"), + ]; + let expected = vec![ + (create_tag(StrTendril::from("a"), StartTag), 1), + (create_tag(StrTendril::from("b"), StartTag), 2), + (create_tag(StrTendril::from("b"), EndTag), 3), + (create_tag(StrTendril::from("a"), EndTag), 4), + ]; + let results = tokenize(vector, opts); + assert_eq!(results, expected); + } +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/states.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/states.rs new file mode 100644 index 000000000..3c3201880 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/tokenizer/states.rs @@ -0,0 +1,97 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Tokenizer states. +//! +//! This is public for use by the tokenizer tests. Other library +//! users should not have to care about this. 
+ +pub use self::AttrValueKind::*; +pub use self::DoctypeIdKind::*; +pub use self::RawKind::*; +pub use self::ScriptEscapeKind::*; +pub use self::State::*; + +#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)] +pub enum ScriptEscapeKind { + Escaped, + DoubleEscaped, +} + +#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)] +pub enum DoctypeIdKind { + Public, + System, +} + +#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)] +pub enum RawKind { + Rcdata, + Rawtext, + ScriptData, + ScriptDataEscaped(ScriptEscapeKind), +} + +#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)] +pub enum AttrValueKind { + Unquoted, + SingleQuoted, + DoubleQuoted, +} + +#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)] +pub enum State { + Data, + Plaintext, + TagOpen, + EndTagOpen, + TagName, + RawData(RawKind), + RawLessThanSign(RawKind), + RawEndTagOpen(RawKind), + RawEndTagName(RawKind), + ScriptDataEscapeStart(ScriptEscapeKind), + ScriptDataEscapeStartDash, + ScriptDataEscapedDash(ScriptEscapeKind), + ScriptDataEscapedDashDash(ScriptEscapeKind), + ScriptDataDoubleEscapeEnd, + BeforeAttributeName, + AttributeName, + AfterAttributeName, + BeforeAttributeValue, + AttributeValue(AttrValueKind), + AfterAttributeValueQuoted, + SelfClosingStartTag, + BogusComment, + MarkupDeclarationOpen, + CommentStart, + CommentStartDash, + Comment, + CommentLessThanSign, + CommentLessThanSignBang, + CommentLessThanSignBangDash, + CommentLessThanSignBangDashDash, + CommentEndDash, + CommentEnd, + CommentEndBang, + Doctype, + BeforeDoctypeName, + DoctypeName, + AfterDoctypeName, + AfterDoctypeKeyword(DoctypeIdKind), + BeforeDoctypeIdentifier(DoctypeIdKind), + DoctypeIdentifierDoubleQuoted(DoctypeIdKind), + DoctypeIdentifierSingleQuoted(DoctypeIdKind), + AfterDoctypeIdentifier(DoctypeIdKind), + BetweenDoctypePublicAndSystemIdentifiers, + BogusDoctype, + CdataSection, + CdataSectionBracket, + CdataSectionEnd, +} diff --git 
a/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/data.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/data.rs new file mode 100644 index 000000000..f18e40e8e --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/data.rs @@ -0,0 +1,170 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use crate::interface::{LimitedQuirks, NoQuirks, Quirks, QuirksMode}; +use crate::tendril::StrTendril; +use crate::tokenizer::Doctype; + +// These should all be lowercase, for ASCII-case-insensitive matching. +static QUIRKY_PUBLIC_PREFIXES: &[&str] = &[ + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", + "-//as//dtd html 3.0 aswedit + extensions//", + "-//ietf//dtd html 2.0 level 1//", + "-//ietf//dtd html 2.0 level 2//", + "-//ietf//dtd html 2.0 strict level 1//", + "-//ietf//dtd html 2.0 strict level 2//", + "-//ietf//dtd html 2.0 strict//", + "-//ietf//dtd html 2.0//", + "-//ietf//dtd html 2.1e//", + "-//ietf//dtd html 3.0//", + "-//ietf//dtd html 3.2 final//", + "-//ietf//dtd html 3.2//", + "-//ietf//dtd html 3//", + "-//ietf//dtd html level 0//", + "-//ietf//dtd html level 1//", + "-//ietf//dtd html level 2//", + "-//ietf//dtd html level 3//", + "-//ietf//dtd html strict level 0//", + "-//ietf//dtd html strict level 1//", + "-//ietf//dtd html strict level 2//", + "-//ietf//dtd html strict level 3//", + "-//ietf//dtd html strict//", + "-//ietf//dtd html//", + "-//metrius//dtd metrius presentational//", + "-//microsoft//dtd internet explorer 2.0 html strict//", + "-//microsoft//dtd internet explorer 2.0 html//", + "-//microsoft//dtd internet explorer 2.0 tables//", + "-//microsoft//dtd internet explorer 3.0 html strict//", + 
"-//microsoft//dtd internet explorer 3.0 html//", + "-//microsoft//dtd internet explorer 3.0 tables//", + "-//netscape comm. corp.//dtd html//", + "-//netscape comm. corp.//dtd strict html//", + "-//o'reilly and associates//dtd html 2.0//", + "-//o'reilly and associates//dtd html extended 1.0//", + "-//o'reilly and associates//dtd html extended relaxed 1.0//", + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", + "-//spyglass//dtd html 2.0 extended//", + "-//sq//dtd html 2.0 hotmetal + extensions//", + "-//sun microsystems corp.//dtd hotjava html//", + "-//sun microsystems corp.//dtd hotjava strict html//", + "-//w3c//dtd html 3 1995-03-24//", + "-//w3c//dtd html 3.2 draft//", + "-//w3c//dtd html 3.2 final//", + "-//w3c//dtd html 3.2//", + "-//w3c//dtd html 3.2s draft//", + "-//w3c//dtd html 4.0 frameset//", + "-//w3c//dtd html 4.0 transitional//", + "-//w3c//dtd html experimental 19960712//", + "-//w3c//dtd html experimental 970421//", + "-//w3c//dtd w3 html//", + "-//w3o//dtd w3 html 3.0//", + "-//webtechs//dtd mozilla html 2.0//", + "-//webtechs//dtd mozilla html//", +]; + +static QUIRKY_PUBLIC_MATCHES: &[&str] = &[ + "-//w3o//dtd w3 html strict 3.0//en//", + "-/w3c/dtd html 4.0 transitional/en", + "html", +]; + +static QUIRKY_SYSTEM_MATCHES: &[&str] = + &["http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"]; + +static LIMITED_QUIRKY_PUBLIC_PREFIXES: &[&str] = &[ + "-//w3c//dtd xhtml 1.0 frameset//", + "-//w3c//dtd xhtml 1.0 transitional//", +]; + +static HTML4_PUBLIC_PREFIXES: &[&str] = &[ + "-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//", +]; + +pub(crate) fn doctype_error_and_quirks( + doctype: &Doctype, + iframe_srcdoc: bool, +) -> (bool, QuirksMode) { + fn opt_string_as_slice(x: &Option) -> Option<&str> { + x.as_deref() + } + + fn opt_tendril_as_slice(x: &Option) -> Option<&str> { + x.as_deref() + } + + fn 
opt_to_ascii_lower(x: Option<&str>) -> Option { + x.map(|y| y.to_ascii_lowercase()) + } + + let name = opt_tendril_as_slice(&doctype.name); + let public = opt_tendril_as_slice(&doctype.public_id); + let system = opt_tendril_as_slice(&doctype.system_id); + + let err = !matches!( + (name, public, system), + (Some("html"), None, None) + | (Some("html"), None, Some("about:legacy-compat")) + | (Some("html"), Some("-//W3C//DTD HTML 4.0//EN"), None) + | ( + Some("html"), + Some("-//W3C//DTD HTML 4.0//EN"), + Some("http://www.w3.org/TR/REC-html40/strict.dtd"), + ) + | (Some("html"), Some("-//W3C//DTD HTML 4.01//EN"), None) + | ( + Some("html"), + Some("-//W3C//DTD HTML 4.01//EN"), + Some("http://www.w3.org/TR/html4/strict.dtd"), + ) + | ( + Some("html"), + Some("-//W3C//DTD XHTML 1.0 Strict//EN"), + Some("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"), + ) + | ( + Some("html"), + Some("-//W3C//DTD XHTML 1.1//EN"), + Some("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"), + ) + ); + + // FIXME: We could do something asymptotically faster here. + // But there aren't many strings, and this happens at most once per parse. + fn contains_pfx(haystack: &[&str], needle: &str) -> bool { + haystack.iter().any(|&x| needle.starts_with(x)) + } + + // Quirks-mode matches are case-insensitive. 
+ let public = opt_to_ascii_lower(public); + let system = opt_to_ascii_lower(system); + + let quirk = match (opt_string_as_slice(&public), opt_string_as_slice(&system)) { + _ if doctype.force_quirks => Quirks, + _ if name != Some("html") => Quirks, + + _ if iframe_srcdoc => NoQuirks, + + (Some(ref p), _) if QUIRKY_PUBLIC_MATCHES.contains(p) => Quirks, + (_, Some(ref s)) if QUIRKY_SYSTEM_MATCHES.contains(s) => Quirks, + + (Some(p), _) if contains_pfx(QUIRKY_PUBLIC_PREFIXES, p) => Quirks, + (Some(p), _) if contains_pfx(LIMITED_QUIRKY_PUBLIC_PREFIXES, p) => LimitedQuirks, + + (Some(p), s) if contains_pfx(HTML4_PUBLIC_PREFIXES, p) => match s { + None => Quirks, + Some(_) => LimitedQuirks, + }, + + _ => NoQuirks, + }; + + (err, quirk) +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/mod.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/mod.rs new file mode 100644 index 000000000..f3ce1515d --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/mod.rs @@ -0,0 +1,1789 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! The HTML5 tree builder. 
+ +pub use crate::interface::{create_element, ElemName, ElementFlags, Tracer, TreeSink}; +pub use crate::interface::{AppendNode, AppendText, Attribute, NodeOrText}; +pub use crate::interface::{LimitedQuirks, NoQuirks, Quirks, QuirksMode}; + +use self::types::*; + +use crate::tendril::StrTendril; +use crate::{ExpandedName, LocalName, Namespace, QualName}; + +use crate::tokenizer; +use crate::tokenizer::states as tok_state; +use crate::tokenizer::{Doctype, EndTag, StartTag, Tag, TokenSink, TokenSinkResult}; + +use std::borrow::Cow::Borrowed; +use std::cell::{Cell, Ref, RefCell}; +use std::collections::VecDeque; +use std::iter::{Enumerate, Rev}; +use std::{fmt, slice}; + +use crate::tokenizer::states::RawKind; +use crate::tree_builder::tag_sets::*; +use crate::util::str::to_escaped_string; +use log::{debug, log_enabled, warn, Level}; +use mac::format_if; +use markup5ever::{expanded_name, local_name, namespace_prefix, namespace_url, ns}; + +pub use self::PushFlag::*; + +#[macro_use] +mod tag_sets; + +mod data; +mod rules; +mod types; + +/// Tree builder options, with an impl for Default. +#[derive(Copy, Clone)] +pub struct TreeBuilderOpts { + /// Report all parse errors described in the spec, at some + /// performance penalty? Default: false + pub exact_errors: bool, + + /// Is scripting enabled? + pub scripting_enabled: bool, + + /// Is this an `iframe srcdoc` document? + pub iframe_srcdoc: bool, + + /// Should we drop the DOCTYPE (if any) from the tree? + pub drop_doctype: bool, + + /// Obsolete, ignored. + pub ignore_missing_rules: bool, + + /// Initial TreeBuilder quirks mode. Default: NoQuirks + pub quirks_mode: QuirksMode, +} + +impl Default for TreeBuilderOpts { + fn default() -> TreeBuilderOpts { + TreeBuilderOpts { + exact_errors: false, + scripting_enabled: true, + iframe_srcdoc: false, + drop_doctype: false, + ignore_missing_rules: false, + quirks_mode: NoQuirks, + } + } +} + +/// The HTML tree builder. 
+pub struct TreeBuilder { + /// Options controlling the behavior of the tree builder. + opts: TreeBuilderOpts, + + /// Consumer of tree modifications. + pub sink: Sink, + + /// Insertion mode. + mode: Cell, + + /// Original insertion mode, used by Text and InTableText modes. + orig_mode: Cell>, + + /// Stack of template insertion modes. + template_modes: RefCell>, + + /// Pending table character tokens. + pending_table_text: RefCell>, + + /// Quirks mode as set by the parser. + /// FIXME: can scripts etc. change this? + quirks_mode: Cell, + + /// The document node, which is created by the sink. + doc_handle: Handle, + + /// Stack of open elements, most recently added at end. + open_elems: RefCell>, + + /// List of active formatting elements. + active_formatting: RefCell>>, + + //§ the-element-pointers + /// Head element pointer. + head_elem: RefCell>, + + /// Form element pointer. + form_elem: RefCell>, + //§ END + /// Frameset-ok flag. + frameset_ok: Cell, + + /// Ignore a following U+000A LINE FEED? + ignore_lf: Cell, + + /// Is foster parenting enabled? + foster_parenting: Cell, + + /// The context element for the fragment parsing algorithm. + context_elem: RefCell>, + + /// Track current line + current_line: Cell, + // WARNING: If you add new fields that contain Handles, you + // must add them to trace_handles() below to preserve memory + // safety! + // + // FIXME: Auto-generate the trace hooks like Servo does. +} + +impl TreeBuilder +where + Handle: Clone, + Sink: TreeSink, +{ + /// Create a new tree builder which sends tree modifications to a particular `TreeSink`. + /// + /// The tree builder is also a `TokenSink`. 
+ pub fn new(sink: Sink, opts: TreeBuilderOpts) -> TreeBuilder { + let doc_handle = sink.get_document(); + TreeBuilder { + opts, + sink, + mode: Cell::new(Initial), + orig_mode: Cell::new(None), + template_modes: Default::default(), + pending_table_text: Default::default(), + quirks_mode: Cell::new(opts.quirks_mode), + doc_handle, + open_elems: Default::default(), + active_formatting: Default::default(), + head_elem: Default::default(), + form_elem: Default::default(), + frameset_ok: Cell::new(true), + ignore_lf: Default::default(), + foster_parenting: Default::default(), + context_elem: Default::default(), + current_line: Cell::new(1), + } + } + + /// Create a new tree builder which sends tree modifications to a particular `TreeSink`. + /// This is for parsing fragments. + /// + /// The tree builder is also a `TokenSink`. + pub fn new_for_fragment( + sink: Sink, + context_elem: Handle, + form_elem: Option, + opts: TreeBuilderOpts, + ) -> TreeBuilder { + let doc_handle = sink.get_document(); + let context_is_template = + sink.elem_name(&context_elem).expanded() == expanded_name!(html "template"); + let template_modes = if context_is_template { + RefCell::new(vec![InTemplate]) + } else { + RefCell::new(vec![]) + }; + + let tb = TreeBuilder { + opts, + sink, + mode: Cell::new(Initial), + orig_mode: Cell::new(None), + template_modes, + pending_table_text: Default::default(), + quirks_mode: Cell::new(opts.quirks_mode), + doc_handle, + open_elems: Default::default(), + active_formatting: Default::default(), + head_elem: Default::default(), + form_elem: RefCell::new(form_elem), + frameset_ok: Cell::new(true), + ignore_lf: Default::default(), + foster_parenting: Default::default(), + context_elem: RefCell::new(Some(context_elem)), + current_line: Cell::new(1), + }; + + // https://html.spec.whatwg.org/multipage/#parsing-html-fragments + // 5. Let root be a new html element with no attributes. + // 6. Append the element root to the Document node created above. + // 7. 
Set up the parser's stack of open elements so that it contains just the single element root. + tb.create_root(vec![]); + // 10. Reset the parser's insertion mode appropriately. + let old_insertion_mode = tb.reset_insertion_mode(); + tb.mode.set(old_insertion_mode); + + tb + } + + // https://html.spec.whatwg.org/multipage/#concept-frag-parse-context + // Step 4. Set the state of the HTML parser's tokenization stage as follows: + pub fn tokenizer_state_for_context_elem(&self) -> tok_state::State { + let context_elem = self.context_elem.borrow(); + let elem = context_elem.as_ref().expect("no context element"); + let elem_name = self.sink.elem_name(elem); + let name = match elem_name.expanded() { + ExpandedName { + ns: &ns!(html), + local, + } => local, + _ => return tok_state::Data, + }; + match *name { + local_name!("title") | local_name!("textarea") => tok_state::RawData(tok_state::Rcdata), + + local_name!("style") + | local_name!("xmp") + | local_name!("iframe") + | local_name!("noembed") + | local_name!("noframes") => tok_state::RawData(tok_state::Rawtext), + + local_name!("script") => tok_state::RawData(tok_state::ScriptData), + + local_name!("noscript") => { + if self.opts.scripting_enabled { + tok_state::RawData(tok_state::Rawtext) + } else { + tok_state::Data + } + }, + + local_name!("plaintext") => tok_state::Plaintext, + + _ => tok_state::Data, + } + } + + /// Call the `Tracer`'s `trace_handle` method on every `Handle` in the tree builder's + /// internal state. This is intended to support garbage-collected DOMs. 
+ pub fn trace_handles(&self, tracer: &dyn Tracer) { + tracer.trace_handle(&self.doc_handle); + for e in &*self.open_elems.borrow() { + tracer.trace_handle(e); + } + + for e in &*self.active_formatting.borrow() { + if let FormatEntry::Element(handle, _) = e { + tracer.trace_handle(handle); + } + } + + if let Some(head_elem) = self.head_elem.borrow().as_ref() { + tracer.trace_handle(head_elem); + } + + if let Some(form_elem) = self.form_elem.borrow().as_ref() { + tracer.trace_handle(form_elem); + } + + if let Some(context_elem) = self.context_elem.borrow().as_ref() { + tracer.trace_handle(context_elem); + } + } + + #[allow(dead_code)] + fn dump_state(&self, label: String) { + println!("dump_state on {label}"); + print!(" open_elems:"); + for node in self.open_elems.borrow().iter() { + let name = self.sink.elem_name(node); + match *name.ns() { + ns!(html) => print!(" {}", name.local_name()), + _ => panic!(), + } + } + println!(); + print!(" active_formatting:"); + for entry in self.active_formatting.borrow().iter() { + match entry { + &Marker => print!(" Marker"), + Element(h, _) => { + let name = self.sink.elem_name(h); + match *name.ns() { + ns!(html) => print!(" {}", name.local_name()), + _ => panic!(), + } + }, + } + } + println!(); + } + + fn debug_step(&self, mode: InsertionMode, token: &Token) { + if log_enabled!(Level::Debug) { + debug!( + "processing {} in insertion mode {:?}", + to_escaped_string(token), + mode + ); + } + } + + fn process_to_completion(&self, mut token: Token) -> TokenSinkResult { + // Queue of additional tokens yet to be processed. + // This stays empty in the common case where we don't split whitespace. + let mut more_tokens = VecDeque::new(); + + loop { + let should_have_acknowledged_self_closing_flag = matches!( + token, + TagToken(Tag { + self_closing: true, + kind: StartTag, + .. 
+ }) + ); + let result = if self.is_foreign(&token) { + self.step_foreign(token) + } else { + let mode = self.mode.get(); + self.step(mode, token) + }; + match result { + Done => { + if should_have_acknowledged_self_closing_flag { + self.sink + .parse_error(Borrowed("Unacknowledged self-closing tag")); + } + token = unwrap_or_return!( + more_tokens.pop_front(), + tokenizer::TokenSinkResult::Continue + ); + }, + DoneAckSelfClosing => { + token = unwrap_or_return!( + more_tokens.pop_front(), + tokenizer::TokenSinkResult::Continue + ); + }, + Reprocess(m, t) => { + self.mode.set(m); + token = t; + }, + ReprocessForeign(t) => { + token = t; + }, + SplitWhitespace(mut buf) => { + let p = buf.pop_front_char_run(|c| c.is_ascii_whitespace()); + let (first, is_ws) = unwrap_or_return!(p, tokenizer::TokenSinkResult::Continue); + let status = if is_ws { Whitespace } else { NotWhitespace }; + token = CharacterTokens(status, first); + + if buf.len32() > 0 { + more_tokens.push_back(CharacterTokens(NotSplit, buf)); + } + }, + Script(node) => { + assert!(more_tokens.is_empty()); + return tokenizer::TokenSinkResult::Script(node); + }, + ToPlaintext => { + assert!(more_tokens.is_empty()); + return tokenizer::TokenSinkResult::Plaintext; + }, + ToRawData(k) => { + assert!(more_tokens.is_empty()); + return tokenizer::TokenSinkResult::RawData(k); + }, + } + } + } + + /// Are we parsing a HTML fragment? 
+ pub fn is_fragment(&self) -> bool { + self.context_elem.borrow().is_some() + } + + /// https://html.spec.whatwg.org/multipage/#appropriate-place-for-inserting-a-node + fn appropriate_place_for_insertion( + &self, + override_target: Option, + ) -> InsertionPoint { + use self::tag_sets::*; + + declare_tag_set!(foster_target = "table" "tbody" "tfoot" "thead" "tr"); + let target = override_target.unwrap_or_else(|| self.current_node().clone()); + if !(self.foster_parenting.get() && self.elem_in(&target, foster_target)) { + if self.html_elem_named(&target, local_name!("template")) { + // No foster parenting (inside template). + let contents = self.sink.get_template_contents(&target); + return LastChild(contents); + } else { + // No foster parenting (the common case). + return LastChild(target); + } + } + + // Foster parenting + let open_elems = self.open_elems.borrow(); + let mut iter = open_elems.iter().rev().peekable(); + while let Some(elem) = iter.next() { + if self.html_elem_named(elem, local_name!("template")) { + let contents = self.sink.get_template_contents(elem); + return LastChild(contents); + } else if self.html_elem_named(elem, local_name!("table")) { + return TableFosterParenting { + element: elem.clone(), + prev_element: (*iter.peek().unwrap()).clone(), + }; + } + } + let html_elem = self.html_elem(); + LastChild(html_elem.clone()) + } + + fn insert_at(&self, insertion_point: InsertionPoint, child: NodeOrText) { + match insertion_point { + LastChild(parent) => self.sink.append(&parent, child), + BeforeSibling(sibling) => self.sink.append_before_sibling(&sibling, child), + TableFosterParenting { + element, + prev_element, + } => self + .sink + .append_based_on_parent_node(&element, &prev_element, child), + } + } +} + +impl TokenSink for TreeBuilder +where + Handle: Clone, + Sink: TreeSink, +{ + type Handle = Handle; + + fn process_token(&self, token: tokenizer::Token, line_number: u64) -> TokenSinkResult { + if line_number != self.current_line.get() { + 
self.sink.set_current_line(line_number); + } + let ignore_lf = self.ignore_lf.take(); + + // Handle `ParseError` and `DoctypeToken`; convert everything else to the local `Token` type. + let token = match token { + tokenizer::ParseError(e) => { + self.sink.parse_error(e); + return tokenizer::TokenSinkResult::Continue; + }, + + tokenizer::DoctypeToken(dt) => { + if self.mode.get() == Initial { + let (err, quirk) = data::doctype_error_and_quirks(&dt, self.opts.iframe_srcdoc); + if err { + self.sink.parse_error(format_if!( + self.opts.exact_errors, + "Bad DOCTYPE", + "Bad DOCTYPE: {:?}", + dt + )); + } + let Doctype { + name, + public_id, + system_id, + force_quirks: _, + } = dt; + if !self.opts.drop_doctype { + self.sink.append_doctype_to_document( + name.unwrap_or(StrTendril::new()), + public_id.unwrap_or(StrTendril::new()), + system_id.unwrap_or(StrTendril::new()), + ); + } + self.set_quirks_mode(quirk); + + self.mode.set(BeforeHtml); + return tokenizer::TokenSinkResult::Continue; + } else { + self.sink.parse_error(format_if!( + self.opts.exact_errors, + "DOCTYPE in body", + "DOCTYPE in insertion mode {:?}", + self.mode.get() + )); + return tokenizer::TokenSinkResult::Continue; + } + }, + + tokenizer::TagToken(x) => TagToken(x), + tokenizer::CommentToken(x) => CommentToken(x), + tokenizer::NullCharacterToken => NullCharacterToken, + tokenizer::EOFToken => EOFToken, + + tokenizer::CharacterTokens(mut x) => { + if ignore_lf && x.starts_with("\n") { + x.pop_front(1); + } + if x.is_empty() { + return tokenizer::TokenSinkResult::Continue; + } + CharacterTokens(NotSplit, x) + }, + }; + + self.process_to_completion(token) + } + + fn end(&self) { + for elem in self.open_elems.borrow_mut().drain(..).rev() { + self.sink.pop(&elem); + } + } + + fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool { + !self.open_elems.borrow().is_empty() + && *self.sink.elem_name(&self.adjusted_current_node()).ns() != ns!(html) + } +} + +pub fn html_elem(open_elems: 
&[Handle]) -> &Handle { + &open_elems[0] +} + +struct ActiveFormattingView<'a, Handle: 'a> { + data: Ref<'a, Vec>>, +} + +impl<'a, Handle: 'a> ActiveFormattingView<'a, Handle> { + fn iter(&'a self) -> impl Iterator + 'a { + ActiveFormattingIter { + iter: self.data.iter().enumerate().rev(), + } + } +} + +pub struct ActiveFormattingIter<'a, Handle: 'a> { + iter: Rev>>>, +} + +impl<'a, Handle> Iterator for ActiveFormattingIter<'a, Handle> { + type Item = (usize, &'a Handle, &'a Tag); + fn next(&mut self) -> Option<(usize, &'a Handle, &'a Tag)> { + match self.iter.next() { + None | Some((_, &Marker)) => None, + Some((i, Element(h, t))) => Some((i, h, t)), + } + } +} + +pub enum PushFlag { + Push, + NoPush, +} + +enum Bookmark { + Replace(Handle), + InsertAfter(Handle), +} + +macro_rules! qualname { + ("", $local:tt) => { + QualName { + prefix: None, + ns: ns!(), + local: local_name!($local), + } + }; + ($prefix: tt $ns:tt $local:tt) => { + QualName { + prefix: Some(namespace_prefix!($prefix)), + ns: ns!($ns), + local: local_name!($local), + } + }; +} + +#[doc(hidden)] +impl TreeBuilder +where + Handle: Clone, + Sink: TreeSink, +{ + fn unexpected(&self, _thing: &T) -> ProcessResult { + self.sink.parse_error(format_if!( + self.opts.exact_errors, + "Unexpected token", + "Unexpected token {} in insertion mode {:?}", + to_escaped_string(_thing), + self.mode.get() + )); + Done + } + + fn assert_named(&self, node: &Handle, name: LocalName) { + assert!(self.html_elem_named(node, name)); + } + + /// Iterate over the active formatting elements (with index in the list) from the end + /// to the last marker, or the beginning if there are no markers. 
+ fn active_formatting_end_to_marker(&self) -> ActiveFormattingView<'_, Handle> { + ActiveFormattingView { + data: self.active_formatting.borrow(), + } + } + + fn position_in_active_formatting(&self, element: &Handle) -> Option { + self.active_formatting + .borrow() + .iter() + .position(|n| match n { + FormatEntry::Marker => false, + FormatEntry::Element(ref handle, _) => self.sink.same_node(handle, element), + }) + } + + fn set_quirks_mode(&self, mode: QuirksMode) { + self.quirks_mode.set(mode); + self.sink.set_quirks_mode(mode); + } + + fn stop_parsing(&self) -> ProcessResult { + Done + } + + //§ parsing-elements-that-contain-only-text + // Switch to `Text` insertion mode, save the old mode, and + // switch the tokenizer to a raw-data state. + // The latter only takes effect after the current / next + // `process_token` of a start tag returns! + fn to_raw_text_mode(&self, k: RawKind) -> ProcessResult { + self.orig_mode.set(Some(self.mode.get())); + self.mode.set(Text); + ToRawData(k) + } + + // The generic raw text / RCDATA parsing algorithm. + fn parse_raw_data(&self, tag: Tag, k: RawKind) -> ProcessResult { + self.insert_element_for(tag); + self.to_raw_text_mode(k) + } + //§ END + + fn current_node(&self) -> Ref { + Ref::map(self.open_elems.borrow(), |elems| { + elems.last().expect("no current element") + }) + } + + fn adjusted_current_node(&self) -> Ref { + if self.open_elems.borrow().len() == 1 { + let context_elem = self.context_elem.borrow(); + let ctx = Ref::filter_map(context_elem, |e| e.as_ref()); + if let Ok(ctx) = ctx { + return ctx; + } + } + self.current_node() + } + + fn current_node_in(&self, set: TagSet) -> bool + where + TagSet: Fn(ExpandedName) -> bool, + { + set(self.sink.elem_name(&self.current_node()).expanded()) + } + + // Insert at the "appropriate place for inserting a node". 
+ fn insert_appropriately(&self, child: NodeOrText, override_target: Option) { + let insertion_point = self.appropriate_place_for_insertion(override_target); + self.insert_at(insertion_point, child); + } + + fn adoption_agency(&self, subject: LocalName) { + // 1. + if self.current_node_named(subject.clone()) + && self + .position_in_active_formatting(&self.current_node()) + .is_none() + { + self.pop(); + return; + } + + // 2. 3. 4. + for _ in 0..8 { + // 5. + let (fmt_elem_index, fmt_elem, fmt_elem_tag) = unwrap_or_return!( + // We clone the Handle and Tag so they don't cause an immutable borrow of self. + self.active_formatting_end_to_marker() + .iter() + .find(|&(_, _, tag)| tag.name == subject) + .map(|(i, h, t)| (i, h.clone(), t.clone())), + { + self.process_end_tag_in_body(Tag { + kind: EndTag, + name: subject, + self_closing: false, + attrs: vec![], + }); + } + ); + + let fmt_elem_stack_index = unwrap_or_return!( + self.open_elems + .borrow() + .iter() + .rposition(|n| self.sink.same_node(n, &fmt_elem)), + { + self.sink + .parse_error(Borrowed("Formatting element not open")); + self.active_formatting.borrow_mut().remove(fmt_elem_index); + } + ); + + // 7. + if !self.in_scope(default_scope, |n| self.sink.same_node(&n, &fmt_elem)) { + self.sink + .parse_error(Borrowed("Formatting element not in scope")); + return; + } + + // 8. + if !self.sink.same_node(&self.current_node(), &fmt_elem) { + self.sink + .parse_error(Borrowed("Formatting element not current node")); + } + + // 9. + let (furthest_block_index, furthest_block) = unwrap_or_return!( + self.open_elems + .borrow() + .iter() + .enumerate() + .skip(fmt_elem_stack_index) + .find(|&(_, open_element)| self.elem_in(open_element, special_tag)) + .map(|(i, h)| (i, h.clone())), + // 10. + { + self.open_elems.borrow_mut().truncate(fmt_elem_stack_index); + self.active_formatting.borrow_mut().remove(fmt_elem_index); + } + ); + + // 11. 
+ let common_ancestor = self.open_elems.borrow()[fmt_elem_stack_index - 1].clone(); + + // 12. + let mut bookmark = Bookmark::Replace(fmt_elem.clone()); + + // 13. + let mut node; + let mut node_index = furthest_block_index; + let mut last_node = furthest_block.clone(); + + // 13.1. + let mut inner_counter = 0; + loop { + // 13.2. + inner_counter += 1; + + // 13.3. + node_index -= 1; + node = self.open_elems.borrow()[node_index].clone(); + + // 13.4. + if self.sink.same_node(&node, &fmt_elem) { + break; + } + + // 13.5. + if inner_counter > 3 { + self.position_in_active_formatting(&node) + .map(|position| self.active_formatting.borrow_mut().remove(position)); + self.open_elems.borrow_mut().remove(node_index); + continue; + } + + let node_formatting_index = unwrap_or_else!( + self.position_in_active_formatting(&node), + // 13.6. + { + self.open_elems.borrow_mut().remove(node_index); + continue; + } + ); + + // 13.7. + let tag = match self.active_formatting.borrow()[node_formatting_index] { + Element(ref h, ref t) => { + assert!(self.sink.same_node(h, &node)); + t.clone() + }, + Marker => panic!("Found marker during adoption agency"), + }; + // FIXME: Is there a way to avoid cloning the attributes twice here (once on their + // own, once as part of t.clone() above)? + let new_element = create_element( + &self.sink, + QualName::new(None, ns!(html), tag.name.clone()), + tag.attrs.clone(), + ); + self.open_elems.borrow_mut()[node_index] = new_element.clone(); + self.active_formatting.borrow_mut()[node_formatting_index] = + Element(new_element.clone(), tag); + node = new_element; + + // 13.8. + if self.sink.same_node(&last_node, &furthest_block) { + bookmark = Bookmark::InsertAfter(node.clone()); + } + + // 13.9. + self.sink.remove_from_parent(&last_node); + self.sink.append(&node, AppendNode(last_node.clone())); + + // 13.10. + last_node = node.clone(); + + // 13.11. + } + + // 14. 
+ self.sink.remove_from_parent(&last_node); + self.insert_appropriately(AppendNode(last_node.clone()), Some(common_ancestor)); + + // 15. + // FIXME: Is there a way to avoid cloning the attributes twice here (once on their own, + // once as part of t.clone() above)? + let new_element = create_element( + &self.sink, + QualName::new(None, ns!(html), fmt_elem_tag.name.clone()), + fmt_elem_tag.attrs.clone(), + ); + let new_entry = Element(new_element.clone(), fmt_elem_tag); + + // 16. + self.sink.reparent_children(&furthest_block, &new_element); + + // 17. + self.sink + .append(&furthest_block, AppendNode(new_element.clone())); + + // 18. + // FIXME: We could probably get rid of the position_in_active_formatting() calls here + // if we had a more clever Bookmark representation. + match bookmark { + Bookmark::Replace(to_replace) => { + let index = self + .position_in_active_formatting(&to_replace) + .expect("bookmark not found in active formatting elements"); + self.active_formatting.borrow_mut()[index] = new_entry; + }, + Bookmark::InsertAfter(previous) => { + let index = self + .position_in_active_formatting(&previous) + .expect("bookmark not found in active formatting elements") + + 1; + self.active_formatting.borrow_mut().insert(index, new_entry); + let old_index = self + .position_in_active_formatting(&fmt_elem) + .expect("formatting element not found in active formatting elements"); + self.active_formatting.borrow_mut().remove(old_index); + }, + } + + // 19. + self.remove_from_stack(&fmt_elem); + let new_furthest_block_index = self + .open_elems + .borrow() + .iter() + .position(|n| self.sink.same_node(n, &furthest_block)) + .expect("furthest block missing from open element stack"); + self.open_elems + .borrow_mut() + .insert(new_furthest_block_index + 1, new_element); + + // 20. 
+ } + } + + fn push(&self, elem: &Handle) { + self.open_elems.borrow_mut().push(elem.clone()); + } + + fn pop(&self) -> Handle { + let elem = self + .open_elems + .borrow_mut() + .pop() + .expect("no current element"); + self.sink.pop(&elem); + elem + } + + fn remove_from_stack(&self, elem: &Handle) { + let position = self + .open_elems + .borrow() + .iter() + .rposition(|x| self.sink.same_node(elem, x)); + if let Some(position) = position { + self.open_elems.borrow_mut().remove(position); + self.sink.pop(elem); + } + } + + fn is_marker_or_open(&self, entry: &FormatEntry) -> bool { + match *entry { + Marker => true, + Element(ref node, _) => self + .open_elems + .borrow() + .iter() + .rev() + .any(|n| self.sink.same_node(n, node)), + } + } + + /// Reconstruct the active formatting elements. + fn reconstruct_formatting(&self) { + { + let active_formatting = self.active_formatting.borrow(); + let last = unwrap_or_return!(active_formatting.last()); + if self.is_marker_or_open(last) { + return; + } + } + + let mut entry_index = self.active_formatting.borrow().len() - 1; + loop { + if entry_index == 0 { + break; + } + entry_index -= 1; + if self.is_marker_or_open(&self.active_formatting.borrow()[entry_index]) { + entry_index += 1; + break; + } + } + + loop { + let tag = match self.active_formatting.borrow()[entry_index] { + Element(_, ref t) => t.clone(), + Marker => panic!("Found marker during formatting element reconstruction"), + }; + + // FIXME: Is there a way to avoid cloning the attributes twice here (once on their own, + // once as part of t.clone() above)? + let new_element = + self.insert_element(Push, ns!(html), tag.name.clone(), tag.attrs.clone()); + self.active_formatting.borrow_mut()[entry_index] = Element(new_element, tag); + if entry_index == self.active_formatting.borrow().len() - 1 { + break; + } + entry_index += 1; + } + } + + /// Get the first element on the stack, which will be the element. 
+ fn html_elem(&self) -> Ref { + Ref::map(self.open_elems.borrow(), |elems| &elems[0]) + } + + /// Get the second element on the stack, if it's a HTML body element. + fn body_elem(&self) -> Option> { + if self.open_elems.borrow().len() <= 1 { + return None; + } + + let node = Ref::map(self.open_elems.borrow(), |elems| &elems[1]); + if self.html_elem_named(&node, local_name!("body")) { + Some(node) + } else { + None + } + } + + /// Signal an error depending on the state of the stack of open elements at + /// the end of the body. + fn check_body_end(&self) { + declare_tag_set!(body_end_ok = + "dd" "dt" "li" "optgroup" "option" "p" "rp" "rt" "tbody" "td" "tfoot" "th" + "thead" "tr" "body" "html"); + + for elem in self.open_elems.borrow().iter() { + let error; + { + let elem_name = self.sink.elem_name(elem); + let name = elem_name.expanded(); + if body_end_ok(name) { + continue; + } + error = format_if!( + self.opts.exact_errors, + "Unexpected open tag at end of body", + "Unexpected open tag {:?} at end of body", + name + ); + } + self.sink.parse_error(error); + // FIXME: Do we keep checking after finding one bad tag? + // The spec suggests not. 
+ return; + } + } + + fn in_scope(&self, scope: TagSet, pred: Pred) -> bool + where + TagSet: Fn(ExpandedName) -> bool, + Pred: Fn(Handle) -> bool, + { + for node in self.open_elems.borrow().iter().rev() { + if pred(node.clone()) { + return true; + } + if scope(self.sink.elem_name(node).expanded()) { + return false; + } + } + + // supposed to be impossible, because is always in scope + + false + } + + fn elem_in(&self, elem: &Handle, set: TagSet) -> bool + where + TagSet: Fn(ExpandedName) -> bool, + { + set(self.sink.elem_name(elem).expanded()) + } + + fn html_elem_named(&self, elem: &Handle, name: LocalName) -> bool { + let elem_name = self.sink.elem_name(elem); + *elem_name.ns() == ns!(html) && *elem_name.local_name() == name + } + + fn in_html_elem_named(&self, name: LocalName) -> bool { + self.open_elems + .borrow() + .iter() + .any(|elem| self.html_elem_named(elem, name.clone())) + } + + fn current_node_named(&self, name: LocalName) -> bool { + self.html_elem_named(&self.current_node(), name) + } + + fn in_scope_named(&self, scope: TagSet, name: LocalName) -> bool + where + TagSet: Fn(ExpandedName) -> bool, + { + self.in_scope(scope, |elem| self.html_elem_named(&elem, name.clone())) + } + + //§ closing-elements-that-have-implied-end-tags + fn generate_implied_end(&self, set: TagSet) + where + TagSet: Fn(ExpandedName) -> bool, + { + loop { + { + let open_elems = self.open_elems.borrow(); + let elem = unwrap_or_return!(open_elems.last()); + let elem_name = self.sink.elem_name(elem); + if !set(elem_name.expanded()) { + return; + } + } + self.pop(); + } + } + + fn generate_implied_end_except(&self, except: LocalName) { + self.generate_implied_end(|p| { + if *p.ns == ns!(html) && *p.local == except { + false + } else { + cursory_implied_end(p) + } + }); + } + //§ END + + // Pop elements until the current element is in the set. 
+ fn pop_until_current(&self, tag_set: TagSet) + where + TagSet: Fn(ExpandedName) -> bool, + { + while !self.current_node_in(&tag_set) { + self.open_elems.borrow_mut().pop(); + } + } + + // Pop elements until an element from the set has been popped. Returns the + // number of elements popped. + fn pop_until

(&self, pred: P) -> usize + where + P: Fn(ExpandedName) -> bool, + { + let mut n = 0; + loop { + n += 1; + match self.open_elems.borrow_mut().pop() { + None => break, + Some(elem) => { + if pred(self.sink.elem_name(&elem).expanded()) { + break; + } + }, + } + } + n + } + + fn pop_until_named(&self, name: LocalName) -> usize { + self.pop_until(|p| *p.ns == ns!(html) && *p.local == name) + } + + // Pop elements until one with the specified name has been popped. + // Signal an error if it was not the first one. + fn expect_to_close(&self, name: LocalName) { + if self.pop_until_named(name.clone()) != 1 { + self.sink.parse_error(format_if!( + self.opts.exact_errors, + "Unexpected open element", + "Unexpected open element while closing {:?}", + name + )); + } + } + + fn close_p_element(&self) { + declare_tag_set!(implied = [cursory_implied_end] - "p"); + self.generate_implied_end(implied); + self.expect_to_close(local_name!("p")); + } + + fn close_p_element_in_button_scope(&self) { + if self.in_scope_named(button_scope, local_name!("p")) { + self.close_p_element(); + } + } + + // Check tags for type=hidden + fn is_type_hidden(&self, tag: &Tag) -> bool { + match tag + .attrs + .iter() + .find(|&at| at.name.expanded() == expanded_name!("", "type")) + { + None => false, + Some(at) => at.value.eq_ignore_ascii_case("hidden"), + } + } + + fn foster_parent_in_body(&self, token: Token) -> ProcessResult { + warn!("foster parenting not implemented"); + self.foster_parenting.set(true); + let res = self.step(InBody, token); + // FIXME: what if res is Reprocess? 
+ self.foster_parenting.set(false); + res + } + + fn process_chars_in_table(&self, token: Token) -> ProcessResult { + declare_tag_set!(table_outer = "table" "tbody" "tfoot" "thead" "tr"); + if self.current_node_in(table_outer) { + assert!(self.pending_table_text.borrow().is_empty()); + self.orig_mode.set(Some(self.mode.get())); + Reprocess(InTableText, token) + } else { + self.sink.parse_error(format_if!( + self.opts.exact_errors, + "Unexpected characters in table", + "Unexpected characters {} in table", + to_escaped_string(&token) + )); + self.foster_parent_in_body(token) + } + } + + // https://html.spec.whatwg.org/multipage/#reset-the-insertion-mode-appropriately + fn reset_insertion_mode(&self) -> InsertionMode { + let open_elems = self.open_elems.borrow(); + for (i, mut node) in open_elems.iter().enumerate().rev() { + let last = i == 0usize; + let context_elem = self.context_elem.borrow(); + if let (true, Some(ctx)) = (last, context_elem.as_ref()) { + node = ctx; + } + let elem_name = self.sink.elem_name(node); + let name = match elem_name.expanded() { + ExpandedName { + ns: &ns!(html), + local, + } => local, + _ => continue, + }; + match *name { + local_name!("select") => { + for ancestor in self.open_elems.borrow()[0..i].iter().rev() { + if self.html_elem_named(ancestor, local_name!("template")) { + return InSelect; + } else if self.html_elem_named(ancestor, local_name!("table")) { + return InSelectInTable; + } + } + return InSelect; + }, + local_name!("td") | local_name!("th") => { + if !last { + return InCell; + } + }, + local_name!("tr") => return InRow, + local_name!("tbody") | local_name!("thead") | local_name!("tfoot") => { + return InTableBody; + }, + local_name!("caption") => return InCaption, + local_name!("colgroup") => return InColumnGroup, + local_name!("table") => return InTable, + local_name!("template") => return *self.template_modes.borrow().last().unwrap(), + local_name!("head") => { + if !last { + return InHead; + } + }, + 
local_name!("body") => return InBody, + local_name!("frameset") => return InFrameset, + local_name!("html") => match *self.head_elem.borrow() { + None => return BeforeHead, + Some(_) => return AfterHead, + }, + + _ => (), + } + } + InBody + } + + fn close_the_cell(&self) { + self.generate_implied_end(cursory_implied_end); + if self.pop_until(td_th) != 1 { + self.sink + .parse_error(Borrowed("expected to close or with cell")); + } + self.clear_active_formatting_to_marker(); + } + + fn append_text(&self, text: StrTendril) -> ProcessResult { + self.insert_appropriately(AppendText(text), None); + Done + } + + fn append_comment(&self, text: StrTendril) -> ProcessResult { + let comment = self.sink.create_comment(text); + self.insert_appropriately(AppendNode(comment), None); + Done + } + + fn append_comment_to_doc(&self, text: StrTendril) -> ProcessResult { + let comment = self.sink.create_comment(text); + self.sink.append(&self.doc_handle, AppendNode(comment)); + Done + } + + fn append_comment_to_html(&self, text: StrTendril) -> ProcessResult { + let open_elems = self.open_elems.borrow(); + let target = html_elem(&open_elems); + let comment = self.sink.create_comment(text); + self.sink.append(target, AppendNode(comment)); + Done + } + + //§ creating-and-inserting-nodes + fn create_root(&self, attrs: Vec) { + let elem = create_element( + &self.sink, + QualName::new(None, ns!(html), local_name!("html")), + attrs, + ); + self.push(&elem); + self.sink.append(&self.doc_handle, AppendNode(elem)); + // FIXME: application cache selection algorithm + } + + // https://html.spec.whatwg.org/multipage/#create-an-element-for-the-token + fn insert_element( + &self, + push: PushFlag, + ns: Namespace, + name: LocalName, + attrs: Vec, + ) -> Handle { + declare_tag_set!(form_associatable = + "button" "fieldset" "input" "object" + "output" "select" "textarea" "img"); + + declare_tag_set!(listed = [form_associatable] - "img"); + + // Step 7. 
+ let qname = QualName::new(None, ns, name); + let elem = create_element(&self.sink, qname.clone(), attrs.clone()); + + let insertion_point = self.appropriate_place_for_insertion(None); + let (node1, node2) = match insertion_point { + LastChild(ref p) | BeforeSibling(ref p) => (p.clone(), None), + TableFosterParenting { + ref element, + ref prev_element, + } => (element.clone(), Some(prev_element.clone())), + }; + + // Step 12. + if form_associatable(qname.expanded()) + && self.form_elem.borrow().is_some() + && !self.in_html_elem_named(local_name!("template")) + && !(listed(qname.expanded()) + && attrs + .iter() + .any(|a| a.name.expanded() == expanded_name!("", "form"))) + { + let form = self.form_elem.borrow().as_ref().unwrap().clone(); + self.sink + .associate_with_form(&elem, &form, (&node1, node2.as_ref())); + } + + self.insert_at(insertion_point, AppendNode(elem.clone())); + + match push { + Push => self.push(&elem), + NoPush => (), + } + // FIXME: Remove from the stack if we can't append? 
+ elem + } + + fn insert_element_for(&self, tag: Tag) -> Handle { + self.insert_element(Push, ns!(html), tag.name, tag.attrs) + } + + fn insert_and_pop_element_for(&self, tag: Tag) -> Handle { + self.insert_element(NoPush, ns!(html), tag.name, tag.attrs) + } + + fn insert_phantom(&self, name: LocalName) -> Handle { + self.insert_element(Push, ns!(html), name, vec![]) + } + + /// + fn insert_foreign_element( + &self, + tag: Tag, + ns: Namespace, + only_add_to_element_stack: bool, + ) -> Handle { + let adjusted_insertion_location = self.appropriate_place_for_insertion(None); + let qname = QualName::new(None, ns, tag.name); + let elem = create_element(&self.sink, qname.clone(), tag.attrs.clone()); + + if !only_add_to_element_stack { + self.insert_at(adjusted_insertion_location, AppendNode(elem.clone())); + } + + self.push(&elem); + + elem + } + //§ END + + /// + /// + /// A start tag whose tag name is "template" + fn should_attach_declarative_shadow(&self, tag: &Tag) -> bool { + let adjusted_insertion_location = self.appropriate_place_for_insertion(None); + + let (intended_parent, _node2) = match adjusted_insertion_location { + LastChild(ref p) | BeforeSibling(ref p) => (p.clone(), None), + TableFosterParenting { + ref element, + ref prev_element, + } => (element.clone(), Some(prev_element.clone())), + }; + + // template start tag's shadowrootmode is not in the none state + let is_shadow_root_mode = tag.attrs.iter().any(|attr| { + attr.name.local == local_name!("shadowrootmode") + && (attr.value.as_ref() == "open" || attr.value.as_ref() == "closed") + }); + + // Check if intended_parent's document allows declarative shadow roots + let allow_declarative_shadow_roots = + self.sink.allow_declarative_shadow_roots(&intended_parent); + + // the adjusted current node is not the topmost element in the stack of open elements + let adjusted_current_node_not_topmost = match self.open_elems.borrow().first() { + // The stack grows downwards; the topmost node on the stack is the 
first one added to the stack + // The current node is the bottommost node in this stack of open elements. + // + // (1) The adjusted current node is the context element if the parser was created as part of the HTML fragment parsing algorithm + // and the stack of open elements has only one element in it (fragment case); + // (2) otherwise, the adjusted current node is the current node (the bottomost node) + // + // => adjusted current node != topmost element in the stack when the stack size > 1 + Some(_) => self.open_elems.borrow().len() > 1, + None => true, + }; + + is_shadow_root_mode && allow_declarative_shadow_roots && adjusted_current_node_not_topmost + } + + /// + /// + /// A start tag whose tag name is "template" + fn attach_declarative_shadow( + &self, + tag: &Tag, + shadow_host: &Handle, + template: &Handle, + ) -> Result<(), String> { + self.sink + .attach_declarative_shadow(shadow_host, template, tag.attrs.clone()) + } + + fn create_formatting_element_for(&self, tag: Tag) -> Handle { + // FIXME: This really wants unit tests. + let mut first_match = None; + let mut matches = 0usize; + for (i, _, old_tag) in self.active_formatting_end_to_marker().iter() { + if tag.equiv_modulo_attr_order(old_tag) { + first_match = Some(i); + matches += 1; + } + } + + if matches >= 3 { + self.active_formatting + .borrow_mut() + .remove(first_match.expect("matches with no index")); + } + + let elem = self.insert_element(Push, ns!(html), tag.name.clone(), tag.attrs.clone()); + self.active_formatting + .borrow_mut() + .push(Element(elem.clone(), tag)); + elem + } + + fn clear_active_formatting_to_marker(&self) { + loop { + match self.active_formatting.borrow_mut().pop() { + None | Some(Marker) => break, + _ => (), + } + } + } + + fn process_end_tag_in_body(&self, tag: Tag) { + // Look back for a matching open element. 
+ let mut match_idx = None; + for (i, elem) in self.open_elems.borrow().iter().enumerate().rev() { + if self.html_elem_named(elem, tag.name.clone()) { + match_idx = Some(i); + break; + } + + if self.elem_in(elem, special_tag) { + self.sink + .parse_error(Borrowed("Found special tag while closing generic tag")); + return; + } + } + + // Can't use unwrap_or_return!() due to rust-lang/rust#16617. + let match_idx = match match_idx { + None => { + // I believe this is impossible, because the root + // element is in special_tag. + self.unexpected(&tag); + return; + }, + Some(x) => x, + }; + + self.generate_implied_end_except(tag.name.clone()); + + if match_idx != self.open_elems.borrow().len() - 1 { + // mis-nested tags + self.unexpected(&tag); + } + self.open_elems.borrow_mut().truncate(match_idx); + } + + fn handle_misnested_a_tags(&self, tag: &Tag) { + let node = unwrap_or_return!(self + .active_formatting_end_to_marker() + .iter() + .find(|&(_, n, _)| self.html_elem_named(n, local_name!("a"))) + .map(|(_, n, _)| n.clone())); + + self.unexpected(tag); + self.adoption_agency(local_name!("a")); + self.position_in_active_formatting(&node) + .map(|index| self.active_formatting.borrow_mut().remove(index)); + self.remove_from_stack(&node); + } + + //§ tree-construction + fn is_foreign(&self, token: &Token) -> bool { + if let EOFToken = *token { + return false; + } + + if self.open_elems.borrow().is_empty() { + return false; + } + + let current = self.adjusted_current_node(); + let elem_name = self.sink.elem_name(¤t); + let name = elem_name.expanded(); + if let ns!(html) = *name.ns { + return false; + } + + if mathml_text_integration_point(name) { + match *token { + CharacterTokens(..) | NullCharacterToken => return false, + TagToken(Tag { + kind: StartTag, + ref name, + .. + }) if !matches!(*name, local_name!("mglyph") | local_name!("malignmark")) => { + return false; + }, + _ => (), + } + } + + if svg_html_integration_point(name) { + match *token { + CharacterTokens(..) 
| NullCharacterToken => return false, + TagToken(Tag { kind: StartTag, .. }) => return false, + _ => (), + } + } + + if let expanded_name!(mathml "annotation-xml") = name { + match *token { + TagToken(Tag { + kind: StartTag, + name: local_name!("svg"), + .. + }) => return false, + CharacterTokens(..) | NullCharacterToken | TagToken(Tag { kind: StartTag, .. }) => { + return !self + .sink + .is_mathml_annotation_xml_integration_point(&self.adjusted_current_node()); + }, + _ => {}, + }; + } + + true + } + //§ END + + fn enter_foreign(&self, mut tag: Tag, ns: Namespace) -> ProcessResult { + match ns { + ns!(mathml) => self.adjust_mathml_attributes(&mut tag), + ns!(svg) => self.adjust_svg_attributes(&mut tag), + _ => (), + } + self.adjust_foreign_attributes(&mut tag); + + if tag.self_closing { + self.insert_element(NoPush, ns, tag.name, tag.attrs); + DoneAckSelfClosing + } else { + self.insert_element(Push, ns, tag.name, tag.attrs); + Done + } + } + + fn adjust_svg_tag_name(&self, tag: &mut Tag) { + let Tag { ref mut name, .. 
} = *tag; + match *name { + local_name!("altglyph") => *name = local_name!("altGlyph"), + local_name!("altglyphdef") => *name = local_name!("altGlyphDef"), + local_name!("altglyphitem") => *name = local_name!("altGlyphItem"), + local_name!("animatecolor") => *name = local_name!("animateColor"), + local_name!("animatemotion") => *name = local_name!("animateMotion"), + local_name!("animatetransform") => *name = local_name!("animateTransform"), + local_name!("clippath") => *name = local_name!("clipPath"), + local_name!("feblend") => *name = local_name!("feBlend"), + local_name!("fecolormatrix") => *name = local_name!("feColorMatrix"), + local_name!("fecomponenttransfer") => *name = local_name!("feComponentTransfer"), + local_name!("fecomposite") => *name = local_name!("feComposite"), + local_name!("feconvolvematrix") => *name = local_name!("feConvolveMatrix"), + local_name!("fediffuselighting") => *name = local_name!("feDiffuseLighting"), + local_name!("fedisplacementmap") => *name = local_name!("feDisplacementMap"), + local_name!("fedistantlight") => *name = local_name!("feDistantLight"), + local_name!("fedropshadow") => *name = local_name!("feDropShadow"), + local_name!("feflood") => *name = local_name!("feFlood"), + local_name!("fefunca") => *name = local_name!("feFuncA"), + local_name!("fefuncb") => *name = local_name!("feFuncB"), + local_name!("fefuncg") => *name = local_name!("feFuncG"), + local_name!("fefuncr") => *name = local_name!("feFuncR"), + local_name!("fegaussianblur") => *name = local_name!("feGaussianBlur"), + local_name!("feimage") => *name = local_name!("feImage"), + local_name!("femerge") => *name = local_name!("feMerge"), + local_name!("femergenode") => *name = local_name!("feMergeNode"), + local_name!("femorphology") => *name = local_name!("feMorphology"), + local_name!("feoffset") => *name = local_name!("feOffset"), + local_name!("fepointlight") => *name = local_name!("fePointLight"), + local_name!("fespecularlighting") => *name = 
local_name!("feSpecularLighting"), + local_name!("fespotlight") => *name = local_name!("feSpotLight"), + local_name!("fetile") => *name = local_name!("feTile"), + local_name!("feturbulence") => *name = local_name!("feTurbulence"), + local_name!("foreignobject") => *name = local_name!("foreignObject"), + local_name!("glyphref") => *name = local_name!("glyphRef"), + local_name!("lineargradient") => *name = local_name!("linearGradient"), + local_name!("radialgradient") => *name = local_name!("radialGradient"), + local_name!("textpath") => *name = local_name!("textPath"), + _ => (), + } + } + + fn adjust_attributes(&self, tag: &mut Tag, mut map: F) + where + F: FnMut(LocalName) -> Option, + { + for &mut Attribute { ref mut name, .. } in &mut tag.attrs { + if let Some(replacement) = map(name.local.clone()) { + *name = replacement; + } + } + } + + fn adjust_svg_attributes(&self, tag: &mut Tag) { + self.adjust_attributes(tag, |k| match k { + local_name!("attributename") => Some(qualname!("", "attributeName")), + local_name!("attributetype") => Some(qualname!("", "attributeType")), + local_name!("basefrequency") => Some(qualname!("", "baseFrequency")), + local_name!("baseprofile") => Some(qualname!("", "baseProfile")), + local_name!("calcmode") => Some(qualname!("", "calcMode")), + local_name!("clippathunits") => Some(qualname!("", "clipPathUnits")), + local_name!("diffuseconstant") => Some(qualname!("", "diffuseConstant")), + local_name!("edgemode") => Some(qualname!("", "edgeMode")), + local_name!("filterunits") => Some(qualname!("", "filterUnits")), + local_name!("glyphref") => Some(qualname!("", "glyphRef")), + local_name!("gradienttransform") => Some(qualname!("", "gradientTransform")), + local_name!("gradientunits") => Some(qualname!("", "gradientUnits")), + local_name!("kernelmatrix") => Some(qualname!("", "kernelMatrix")), + local_name!("kernelunitlength") => Some(qualname!("", "kernelUnitLength")), + local_name!("keypoints") => Some(qualname!("", "keyPoints")), + 
local_name!("keysplines") => Some(qualname!("", "keySplines")), + local_name!("keytimes") => Some(qualname!("", "keyTimes")), + local_name!("lengthadjust") => Some(qualname!("", "lengthAdjust")), + local_name!("limitingconeangle") => Some(qualname!("", "limitingConeAngle")), + local_name!("markerheight") => Some(qualname!("", "markerHeight")), + local_name!("markerunits") => Some(qualname!("", "markerUnits")), + local_name!("markerwidth") => Some(qualname!("", "markerWidth")), + local_name!("maskcontentunits") => Some(qualname!("", "maskContentUnits")), + local_name!("maskunits") => Some(qualname!("", "maskUnits")), + local_name!("numoctaves") => Some(qualname!("", "numOctaves")), + local_name!("pathlength") => Some(qualname!("", "pathLength")), + local_name!("patterncontentunits") => Some(qualname!("", "patternContentUnits")), + local_name!("patterntransform") => Some(qualname!("", "patternTransform")), + local_name!("patternunits") => Some(qualname!("", "patternUnits")), + local_name!("pointsatx") => Some(qualname!("", "pointsAtX")), + local_name!("pointsaty") => Some(qualname!("", "pointsAtY")), + local_name!("pointsatz") => Some(qualname!("", "pointsAtZ")), + local_name!("preservealpha") => Some(qualname!("", "preserveAlpha")), + local_name!("preserveaspectratio") => Some(qualname!("", "preserveAspectRatio")), + local_name!("primitiveunits") => Some(qualname!("", "primitiveUnits")), + local_name!("refx") => Some(qualname!("", "refX")), + local_name!("refy") => Some(qualname!("", "refY")), + local_name!("repeatcount") => Some(qualname!("", "repeatCount")), + local_name!("repeatdur") => Some(qualname!("", "repeatDur")), + local_name!("requiredextensions") => Some(qualname!("", "requiredExtensions")), + local_name!("requiredfeatures") => Some(qualname!("", "requiredFeatures")), + local_name!("specularconstant") => Some(qualname!("", "specularConstant")), + local_name!("specularexponent") => Some(qualname!("", "specularExponent")), + local_name!("spreadmethod") => 
Some(qualname!("", "spreadMethod")), + local_name!("startoffset") => Some(qualname!("", "startOffset")), + local_name!("stddeviation") => Some(qualname!("", "stdDeviation")), + local_name!("stitchtiles") => Some(qualname!("", "stitchTiles")), + local_name!("surfacescale") => Some(qualname!("", "surfaceScale")), + local_name!("systemlanguage") => Some(qualname!("", "systemLanguage")), + local_name!("tablevalues") => Some(qualname!("", "tableValues")), + local_name!("targetx") => Some(qualname!("", "targetX")), + local_name!("targety") => Some(qualname!("", "targetY")), + local_name!("textlength") => Some(qualname!("", "textLength")), + local_name!("viewbox") => Some(qualname!("", "viewBox")), + local_name!("viewtarget") => Some(qualname!("", "viewTarget")), + local_name!("xchannelselector") => Some(qualname!("", "xChannelSelector")), + local_name!("ychannelselector") => Some(qualname!("", "yChannelSelector")), + local_name!("zoomandpan") => Some(qualname!("", "zoomAndPan")), + _ => None, + }); + } + + fn adjust_mathml_attributes(&self, tag: &mut Tag) { + self.adjust_attributes(tag, |k| match k { + local_name!("definitionurl") => Some(qualname!("", "definitionURL")), + _ => None, + }); + } + + fn adjust_foreign_attributes(&self, tag: &mut Tag) { + self.adjust_attributes(tag, |k| match k { + local_name!("xlink:actuate") => Some(qualname!("xlink" xlink "actuate")), + local_name!("xlink:arcrole") => Some(qualname!("xlink" xlink "arcrole")), + local_name!("xlink:href") => Some(qualname!("xlink" xlink "href")), + local_name!("xlink:role") => Some(qualname!("xlink" xlink "role")), + local_name!("xlink:show") => Some(qualname!("xlink" xlink "show")), + local_name!("xlink:title") => Some(qualname!("xlink" xlink "title")), + local_name!("xlink:type") => Some(qualname!("xlink" xlink "type")), + local_name!("xml:lang") => Some(qualname!("xml" xml "lang")), + local_name!("xml:space") => Some(qualname!("xml" xml "space")), + local_name!("xmlns") => Some(qualname!("" xmlns 
"xmlns")), + local_name!("xmlns:xlink") => Some(qualname!("xmlns" xmlns "xlink")), + _ => None, + }); + } + + fn foreign_start_tag(&self, mut tag: Tag) -> ProcessResult { + let current_ns = self + .sink + .elem_name(&self.adjusted_current_node()) + .ns() + .clone(); + match current_ns { + ns!(mathml) => self.adjust_mathml_attributes(&mut tag), + ns!(svg) => { + self.adjust_svg_tag_name(&mut tag); + self.adjust_svg_attributes(&mut tag); + }, + _ => (), + } + self.adjust_foreign_attributes(&mut tag); + if tag.self_closing { + // FIXME(#118): in SVG + + tag @ => { + let mut first = true; + let mut stack_idx = self.open_elems.borrow().len() - 1; + loop { + if stack_idx == 0 { + return Done; + } + + let html; + let eq; + { + let open_elems = self.open_elems.borrow(); + let node_name = self.sink.elem_name(&open_elems[stack_idx]); + html = *node_name.ns() == ns!(html); + eq = node_name.local_name().eq_ignore_ascii_case(&tag.name); + } + if !first && html { + let mode = self.mode.get(); + return self.step(mode, TagToken(tag)); + } + + if eq { + self.open_elems.borrow_mut().truncate(stack_idx); + return Done; + } + + if first { + self.unexpected(&tag); + first = false; + } + stack_idx -= 1; + } + } + + // FIXME: This should be unreachable, but match_token requires a + // catch-all case. + _ => panic!("impossible case in foreign content"), + }) + } +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/tag_sets.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/tag_sets.rs new file mode 100644 index 000000000..0d587973e --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/tag_sets.rs @@ -0,0 +1,114 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. 
This file may not be copied, modified, or distributed +// except according to those terms. + +//! Various sets of HTML tag names, and macros for declaring them. + +use crate::ExpandedName; +use markup5ever::{expanded_name, local_name, namespace_url, ns}; + +macro_rules! declare_tag_set_impl ( ($param:ident, $b:ident, $supr:ident, $($tag:tt)+) => ( + match $param { + $( expanded_name!(html $tag) => $b, )+ + p => $supr(p), + } +)); + +macro_rules! declare_tag_set_body ( + ($param:ident = [$supr:ident] - $($tag:tt)+) + => ( declare_tag_set_impl!($param, false, $supr, $($tag)+) ); + + ($param:ident = [$supr:ident] + $($tag:tt)+) + => ( declare_tag_set_impl!($param, true, $supr, $($tag)+) ); + + ($param:ident = $($tag:tt)+) + => ( declare_tag_set_impl!($param, true, empty_set, $($tag)+) ); +); + +macro_rules! declare_tag_set ( + (pub $name:ident = $($toks:tt)+) => ( + pub(crate) fn $name(p: crate::ExpandedName) -> bool { + declare_tag_set_body!(p = $($toks)+) + } + ); + + ($name:ident = $($toks:tt)+) => ( + fn $name(p: crate::ExpandedName) -> bool { + declare_tag_set_body!(p = $($toks)+) + } + ); +); + +#[inline(always)] +pub(crate) fn empty_set(_: ExpandedName) -> bool { + false +} +#[inline(always)] +pub(crate) fn full_set(_: ExpandedName) -> bool { + true +} + +declare_tag_set!(pub html_default_scope = + "applet" "caption" "html" "table" "td" "th" "marquee" "object" "template"); + +#[inline(always)] +pub(crate) fn default_scope(name: ExpandedName) -> bool { + html_default_scope(name) + || mathml_text_integration_point(name) + || svg_html_integration_point(name) +} + +declare_tag_set!(pub list_item_scope = [default_scope] + "ol" "ul"); +declare_tag_set!(pub button_scope = [default_scope] + "button"); +declare_tag_set!(pub table_scope = "html" "table" "template"); +declare_tag_set!(pub select_scope = [full_set] - "optgroup" "option"); + +declare_tag_set!(pub table_body_context = "tbody" "tfoot" "thead" "template" "html"); +declare_tag_set!(pub table_row_context = "tr" 
"template" "html"); +declare_tag_set!(pub td_th = "td" "th"); + +declare_tag_set!(pub cursory_implied_end = + "dd" "dt" "li" "option" "optgroup" "p" "rb" "rp" "rt" "rtc"); + +declare_tag_set!(pub thorough_implied_end = [cursory_implied_end] + + "caption" "colgroup" "tbody" "td" "tfoot" "th" "thead" "tr"); + +declare_tag_set!(pub heading_tag = "h1" "h2" "h3" "h4" "h5" "h6"); + +declare_tag_set!(pub special_tag = + "address" "applet" "area" "article" "aside" "base" "basefont" "bgsound" "blockquote" "body" + "br" "button" "caption" "center" "col" "colgroup" "dd" "details" "dir" "div" "dl" "dt" "embed" + "fieldset" "figcaption" "figure" "footer" "form" "frame" "frameset" "h1" "h2" "h3" "h4" "h5" + "h6" "head" "header" "hgroup" "hr" "html" "iframe" "img" "input" "isindex" "li" "link" + "listing" "main" "marquee" "menu" "meta" "nav" "noembed" "noframes" "noscript" + "object" "ol" "p" "param" "plaintext" "pre" "script" "section" "select" "source" "style" + "summary" "table" "tbody" "td" "template" "textarea" "tfoot" "th" "thead" "title" "tr" "track" + "ul" "wbr" "xmp"); +//§ END + +pub(crate) fn mathml_text_integration_point(p: ExpandedName) -> bool { + matches!( + p, + expanded_name!(mathml "mi") + | expanded_name!(mathml "mo") + | expanded_name!(mathml "mn") + | expanded_name!(mathml "ms") + | expanded_name!(mathml "mtext") + ) +} + +/// https://html.spec.whatwg.org/multipage/#html-integration-point +pub(crate) fn svg_html_integration_point(p: ExpandedName) -> bool { + // annotation-xml are handle in another place + matches!( + p, + expanded_name!(svg "foreignObject") + | expanded_name!(svg "desc") + | expanded_name!(svg "title") + ) +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/types.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/types.rs new file mode 100644 index 000000000..f6bb588d0 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/tree_builder/types.rs @@ -0,0 +1,98 @@ +// Copyright 2014-2017 
The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Types used within the tree builder code. Not exported to users. + +use crate::tokenizer::states::RawKind; +use crate::tokenizer::Tag; + +use crate::tendril::StrTendril; + +pub(crate) use self::FormatEntry::*; +pub(crate) use self::InsertionMode::*; +pub(crate) use self::InsertionPoint::*; +pub(crate) use self::ProcessResult::*; +pub(crate) use self::SplitStatus::*; +pub(crate) use self::Token::*; + +#[derive(PartialEq, Eq, Copy, Clone, Debug)] +pub(crate) enum InsertionMode { + Initial, + BeforeHtml, + BeforeHead, + InHead, + InHeadNoscript, + AfterHead, + InBody, + Text, + InTable, + InTableText, + InCaption, + InColumnGroup, + InTableBody, + InRow, + InCell, + InSelect, + InSelectInTable, + InTemplate, + AfterBody, + InFrameset, + AfterFrameset, + AfterAfterBody, + AfterAfterFrameset, +} + +#[derive(PartialEq, Eq, Copy, Clone, Debug)] +pub(crate) enum SplitStatus { + NotSplit, + Whitespace, + NotWhitespace, +} + +/// A subset/refinement of `tokenizer::Token`. Everything else is handled +/// specially at the beginning of `process_token`. +#[derive(PartialEq, Eq, Clone, Debug)] +#[allow(clippy::enum_variant_names)] +pub(crate) enum Token { + TagToken(Tag), + CommentToken(StrTendril), + CharacterTokens(SplitStatus, StrTendril), + NullCharacterToken, + EOFToken, +} + +pub(crate) enum ProcessResult<Handle> { + Done, + DoneAckSelfClosing, + SplitWhitespace(StrTendril), + Reprocess(InsertionMode, Token), + #[allow(dead_code)] // FIXME + ReprocessForeign(Token), + Script(Handle), + ToPlaintext, + ToRawData(RawKind), +} + +pub(crate) enum FormatEntry<Handle> { + Element(Handle, Tag), + Marker, +} + +pub(crate) enum InsertionPoint<Handle> { + /// Insert as last child in this parent.
+ LastChild(Handle), + #[allow(dead_code)] // FIXME + /// Insert before this following sibling. + BeforeSibling(Handle), + /// Insertion point is decided based on existence of element's parent node. + TableFosterParenting { + element: Handle, + prev_element: Handle, + }, +} diff --git a/collector/compile-benchmarks/html5ever-0.31.0/src/util/str.rs b/collector/compile-benchmarks/html5ever-0.31.0/src/util/str.rs new file mode 100644 index 000000000..2c0ec3e2e --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/src/util/str.rs @@ -0,0 +1,42 @@ +// Copyright 2014-2017 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::fmt; + +pub(crate) fn to_escaped_string<T: fmt::Debug>(x: &T) -> String { + // FIXME: don't allocate twice + let string = format!("{x:?}"); + string.chars().flat_map(|c| c.escape_default()).collect() +} + +/// If `c` is an ASCII letter, return the corresponding lowercase +/// letter, otherwise None.
+pub(crate) fn lower_ascii_letter(c: char) -> Option<char> { + match c { + 'a'..='z' => Some(c), + 'A'..='Z' => Some((c as u8 - b'A' + b'a') as char), + _ => None, + } +} + +#[cfg(test)] +#[allow(non_snake_case)] +mod test { + use super::lower_ascii_letter; + use mac::test_eq; + + test_eq!(lower_letter_a_is_a, lower_ascii_letter('a'), Some('a')); + test_eq!(lower_letter_A_is_a, lower_ascii_letter('A'), Some('a')); + test_eq!(lower_letter_symbol_is_None, lower_ascii_letter('!'), None); + test_eq!( + lower_letter_nonascii_is_None, + lower_ascii_letter('\u{a66e}'), + None + ); +} From b42fa75c8ea4f9e3112cea19e3e7f5719115edda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ber=C3=A1nek?= Date: Mon, 14 Apr 2025 14:30:47 +0200 Subject: [PATCH 2/2] Add benchmark files and update documentation --- collector/compile-benchmarks/README.md | 1 + collector/compile-benchmarks/REUSE.toml | 5 +++++ .../html5ever-0.31.0/0-println.patch | 12 ++++++++++++ .../compile-benchmarks/html5ever-0.31.0/Cargo.toml | 2 ++ .../html5ever-0.31.0/perf-config.json | 5 +++-- 5 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 collector/compile-benchmarks/html5ever-0.31.0/0-println.patch diff --git a/collector/compile-benchmarks/README.md b/collector/compile-benchmarks/README.md index 21051db03..cf73dad24 100644 --- a/collector/compile-benchmarks/README.md +++ b/collector/compile-benchmarks/README.md @@ -32,6 +32,7 @@ They mostly consist of real-world crates. crate. - **helloworld**: A trivial program. Gives a lower bound on compile time. - **html5ever-0.26.0**: An HTML parser. Stresses macro parsing code. +- **html5ever-0.31.0**: An HTML parser. Stresses macro parsing code. - **hyper-0.14.18**: A fairly large crate. Utilizes async/await, and used by many Rust programs. The crate uses cargo features to enable large portions of its structure and is built with `--features=client,http1,http2,server,stream`.
diff --git a/collector/compile-benchmarks/REUSE.toml b/collector/compile-benchmarks/REUSE.toml index a1e1a312b..87e5d2cec 100644 --- a/collector/compile-benchmarks/REUSE.toml +++ b/collector/compile-benchmarks/REUSE.toml @@ -110,6 +110,11 @@ path = "html5ever-0.26.0/**" SPDX-FileCopyrightText = "The html5ever Project Developers" SPDX-License-Identifier = "MIT OR Apache-2.0" +[[annotations]] +path = "html5ever-0.31.0/**" +SPDX-FileCopyrightText = "The html5ever Project Developers" +SPDX-License-Identifier = "MIT OR Apache-2.0" + [[annotations]] path = "hyper-0.14.18/**" SPDX-FileCopyrightText = "hyper contributors" diff --git a/collector/compile-benchmarks/html5ever-0.31.0/0-println.patch b/collector/compile-benchmarks/html5ever-0.31.0/0-println.patch new file mode 100644 index 000000000..6e7a03328 --- /dev/null +++ b/collector/compile-benchmarks/html5ever-0.31.0/0-println.patch @@ -0,0 +1,12 @@ +diff --git a/src/util/str.rs b/src/util/str.rs +index 2c0ec3e2..a78ff669 100644 +--- a/src/util/str.rs ++++ b/src/util/str.rs +@@ -10,6 +10,7 @@ + use std::fmt; + + pub(crate) fn to_escaped_string(x: &T) -> String { ++ println!("testing"); + // FIXME: don't allocate twice + let string = format!("{x:?}"); + string.chars().flat_map(|c| c.escape_default()).collect() diff --git a/collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml b/collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml index 016e26afe..b9d33d28e 100644 --- a/collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml +++ b/collector/compile-benchmarks/html5ever-0.31.0/Cargo.toml @@ -86,3 +86,5 @@ version = "2.0.2" [features] trace_tokenizer = [] + +[workspace] diff --git a/collector/compile-benchmarks/html5ever-0.31.0/perf-config.json b/collector/compile-benchmarks/html5ever-0.31.0/perf-config.json index 710581fa8..23539ac6b 100644 --- a/collector/compile-benchmarks/html5ever-0.31.0/perf-config.json +++ b/collector/compile-benchmarks/html5ever-0.31.0/perf-config.json @@ -1,4 +1,5 @@ { - "artifact": 
"library", - "category": "primary" + "touch_file": "src/lib.rs", + "category": "primary", + "artifact": "library" }