
Bump #9


Open
wants to merge 40 commits into base: master
Commits (40)
2e2e755
Add CTC Decoder for Wave2Vec models (#693)
Narsil May 20, 2021
4b0dc6b
Fix SPM conversions (#686)
LysandreJik May 20, 2021
3cf957e
Bump handlebars from 4.7.6 to 4.7.7 in /bindings/node (#700)
dependabot[bot] May 20, 2021
7574349
Bump y18n from 4.0.0 to 4.0.3 in /bindings/node (#708)
dependabot[bot] May 20, 2021
8f639b4
Bump hosted-git-info from 2.8.8 to 2.8.9 in /bindings/node (#702)
dependabot[bot] May 20, 2021
bd19584
Bump lodash from 4.17.19 to 4.17.21 in /bindings/node (#701)
dependabot[bot] May 20, 2021
4b7f8c2
Fix CHANGELOG.md
n1t0 May 24, 2021
c046da7
Fix stripping strings containing Unicode characters (#707)
Narsil May 24, 2021
3a002c1
Python - prepare for release 0.10.3
n1t0 May 24, 2021
755e5f5
Remove support for Python 3.5 (#714)
n1t0 May 24, 2021
d83772d
Fixing tokenizers with 1.53 (updated some dependencies + clippy) (#764)
Narsil Jul 21, 2021
256a71c
Clippy 1.54. (#773)
Narsil Aug 11, 2021
96c122c
Bump ws from 7.3.1 to 7.4.6 in /bindings/node (#721)
dependabot[bot] Aug 12, 2021
5d1b0a9
Bump glob-parent from 5.1.1 to 5.1.2 in /bindings/node (#734)
dependabot[bot] Aug 12, 2021
ab3d3bc
Bump tar from 4.4.13 to 4.4.17 in /bindings/node (#775)
dependabot[bot] Aug 12, 2021
46bed54
Bump path-parse from 1.0.6 to 1.0.7 in /bindings/node (#774)
dependabot[bot] Aug 12, 2021
da4c7b1
Add a way to specify the unknown token in `SentencePieceUnigramTokeni…
SaulLu Aug 12, 2021
6616e69
Expand documentation of UnigramTrainer (#770)
sgugger Aug 12, 2021
71fb73e
update lexical-core because 0.7.4 doesn't compile (#758)
KoichiYasuoka Aug 12, 2021
c1100dc
Fix typo in documentation (#743)
kingyiusuen Aug 13, 2021
e2bf8da
Add SplitDelimiterBehavior to Punctuation constructor (#657)
vladdy Aug 13, 2021
5982498
Switch git dependencies in Cargo.toml back to regular versions (#728)
geofft Aug 13, 2021
e7dd643
Fix word level tokenizer determinism (#718)
lucacampanella Aug 13, 2021
e71e5be
Rust - Add from_pretrained on Tokenizer
n1t0 Aug 19, 2021
e44fdee
Python - Add bindings to Tokenizer.from_pretrained
n1t0 Aug 19, 2021
6f9e867
Better export for FromPretrainedParameters
n1t0 Aug 19, 2021
528c9a5
Node - Add bindings to Tokenizer.from_pretrained
n1t0 Aug 19, 2021
a4d0f3d
Update docs for from_pretrained
n1t0 Aug 19, 2021
ad7090a
Improve READMEs for from_pretrained
n1t0 Aug 19, 2021
35c96e5
Add tests for from_pretrained
n1t0 Aug 24, 2021
c65b72d
Rust - Prepare for release 0.11.0 (#789)
n1t0 Aug 31, 2021
e68aecc
Python - Update Cargo.lock
n1t0 Sep 2, 2021
23cf8c6
Bump tar from 4.4.17 to 4.4.19 in /bindings/node (#792)
dependabot[bot] Sep 2, 2021
b8b584d
Python - Pretty json saving defaults to true (#793)
n1t0 Sep 2, 2021
884bfb7
Prepare node release (#794)
n1t0 Sep 2, 2021
36204c8
Exclude node 15.x for windows
n1t0 Sep 2, 2021
fd316bd
Update esaxx-rs to 0.1.7 to fix building on windows
n1t0 Sep 2, 2021
2143a24
resolve merge conflicts
tscholak Sep 3, 2021
bd75835
bump
tscholak Sep 3, 2021
e061fe1
try to bump cayon-cond
tscholak Sep 3, 2021
901 changes: 886 additions & 15 deletions Cargo.lock

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions bindings/node/CHANGELOG.md
@@ -1,3 +1,24 @@
# [0.8.0](https://github.com/huggingface/tokenizers/compare/node-v0.7.0...node-v0.8.0) (2021-09-02)

### BREAKING CHANGES
- Many improvements on the Trainer ([#519](https://github.com/huggingface/tokenizers/pull/519)).
The files must now be provided first when calling `tokenizer.train(files, trainer)`.

### Features
- Adding the `TemplateProcessing`
- Add `WordLevel` and `Unigram` models ([#490](https://github.com/huggingface/tokenizers/pull/490))
- Add `nmtNormalizer` and `precompiledNormalizer` normalizers ([#490](https://github.com/huggingface/tokenizers/pull/490))
- Add `templateProcessing` post-processor ([#490](https://github.com/huggingface/tokenizers/pull/490))
- Add `digitsPreTokenizer` pre-tokenizer ([#490](https://github.com/huggingface/tokenizers/pull/490))
- Add support for mapping to sequences ([#506](https://github.com/huggingface/tokenizers/pull/506))
- Add `splitPreTokenizer` pre-tokenizer ([#542](https://github.com/huggingface/tokenizers/pull/542))
- Add `behavior` option to the `punctuationPreTokenizer` ([#657](https://github.com/huggingface/tokenizers/pull/657))
- Add the ability to load tokenizers from the Hugging Face Hub using `fromPretrained` ([#780](https://github.com/huggingface/tokenizers/pull/780))

### Fixes
- Fix a bug where long tokenizer.json files would be incorrectly deserialized ([#459](https://github.com/huggingface/tokenizers/pull/459))
- Fix RobertaProcessing deserialization in PostProcessorWrapper ([#464](https://github.com/huggingface/tokenizers/pull/464))

# [0.7.0](https://github.com/huggingface/tokenizers/compare/node-v0.6.2...node-v0.7.0) (2020-07-01)

### BREAKING CHANGES
2 changes: 1 addition & 1 deletion bindings/node/Makefile
@@ -27,7 +27,7 @@ $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt

$(DATA_DIR)/roberta.json :
$(dir_guard)
-wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
+wget https://huggingface.co/roberta-large/raw/main/tokenizer.json -O $@

$(DATA_DIR)/tokenizer-wiki.json :
$(dir_guard)
13 changes: 13 additions & 0 deletions bindings/node/lib/bindings/decoders.d.ts
@@ -36,3 +36,16 @@ export function metaspaceDecoder(replacement?: string, addPrefixSpace?: boolean)
* This suffix will be replaced by whitespaces during the decoding
*/
export function bpeDecoder(suffix?: string): Decoder;

/**
* Instantiate a new CTC Decoder
* @param [pad_token='<pad>'] The pad token used by CTC to delimit a new token.
* @param [word_delimiter_token='|'] The word delimiter token. It will be replaced by a space.
* @param [cleanup=true] Whether to clean up some tokenization artifacts,
* mainly spaces before punctuation and some abbreviated English forms.
*/
export function ctcDecoder(
pad_token?: string,
word_delimiter_token?: string,
cleanup?: boolean
): Decoder;
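
For orientation, here is an editor's minimal sketch of the collapse rule a CTC decoder applies (merge consecutive repeats, drop the pad token, map the word delimiter token to a space); `ctcCollapse` is a hypothetical helper, not the native implementation, and the `cleanup` flag is not modeled:

```ts
// Illustrative sketch of CTC decoding (not the binding's native code).
// Steps: 1) merge consecutive duplicate tokens, 2) drop the pad token,
// 3) replace the word delimiter token with a space.
function ctcCollapse(
  tokens: string[],
  padToken = "<pad>",
  wordDelimiterToken = "|"
): string {
  const merged: string[] = [];
  for (const token of tokens) {
    if (merged[merged.length - 1] !== token) merged.push(token); // merge repeats
  }
  return merged
    .filter((token) => token !== padToken) // drop pad tokens
    .map((token) => (token === wordDelimiterToken ? " " : token)) // delimiter -> space
    .join("");
}

// Mirrors the test below:
// ctcCollapse(["<pad>", "h", "h", "e", "e", "l", "l", "<pad>", "l", "l", "o"]) === "hello"
```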
1 change: 1 addition & 0 deletions bindings/node/lib/bindings/decoders.js
@@ -5,4 +5,5 @@ module.exports = {
wordPieceDecoder: native.decoders_WordPiece,
metaspaceDecoder: native.decoders_Metaspace,
bpeDecoder: native.decoders_BPEDecoder,
ctcDecoder: native.decoders_CTC,
};
13 changes: 12 additions & 1 deletion bindings/node/lib/bindings/decoders.test.ts
@@ -1,4 +1,4 @@
-import { bpeDecoder, metaspaceDecoder, wordPieceDecoder } from "./decoders";
+import { bpeDecoder, ctcDecoder, metaspaceDecoder, wordPieceDecoder } from "./decoders";

describe("wordPieceDecoder", () => {
it("accepts `undefined` as first parameter", () => {
@@ -31,3 +31,14 @@ describe("bpeDecoder", () => {
expect(bpeDecoder(undefined)).toBeDefined();
});
});

describe("ctcDecoder", () => {
it("accepts `undefined` as parameter", () => {
expect(ctcDecoder(undefined)).toBeDefined();
});
it("encodes correctly", () => {
expect(
ctcDecoder().decode(["<pad>", "h", "h", "e", "e", "l", "l", "<pad>", "l", "l", "o"])
).toEqual("hello");
});
});
10 changes: 7 additions & 3 deletions bindings/node/lib/bindings/pre-tokenizers.d.ts
@@ -90,10 +90,14 @@ export function charDelimiterSplitPreTokenizer(delimiter: string): PreTokenizer;

/**
* Returns a new Punctuation PreTokenizer.
- * This pre-tokenizer splits tokens on punctuation.
- * Each occurrence of a punctuation character will be treated separately.
+ * This pre-tokenizer splits tokens on punctuation according to the provided behavior.
+ * Each occurrence of a punctuation character is treated separately.
+ *
+ * @param [behavior="isolated"] The behavior to use when splitting.
+ *   Choices: "removed", "isolated", "mergedWithPrevious", "mergedWithNext",
+ *   "contiguous"
 */
-export function punctuationPreTokenizer(): PreTokenizer;
+export function punctuationPreTokenizer(behavior?: string): PreTokenizer;

/**
* Returns a new Sequence PreTokenizer.
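The `behavior` values map to the Rust `SplitDelimiterBehavior` enum. As a hedged illustration, the comments below follow that enum's documented example ("the-final--countdown" split on "-") rather than output captured from these bindings:

```ts
import { punctuationPreTokenizer } from "./pre-tokenizers"; // relative import, as in the tests

// Each behavior decides what happens to the matched punctuation itself:
const preTokenizer = punctuationPreTokenizer("mergedWithPrevious");

// behavior              "the-final--countdown" split on "-"
// "removed"             ["the", "final", "countdown"]
// "isolated"            ["the", "-", "final", "-", "-", "countdown"]
// "mergedWithPrevious"  ["the-", "final-", "-", "countdown"]
// "mergedWithNext"      ["the", "-final", "-", "-countdown"]
// "contiguous"          ["the", "-", "final", "--", "countdown"]
```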
5 changes: 5 additions & 0 deletions bindings/node/lib/bindings/pre-tokenizers.test.ts
@@ -43,6 +43,11 @@ describe("punctuationPreTokenizer", () => {
const processor = punctuationPreTokenizer();
expect(processor.constructor.name).toEqual("PreTokenizer");
});

it("instantiates correctly with non-default split delimeter", () => {
const processor = punctuationPreTokenizer("removed");
expect(processor.constructor.name).toEqual("PreTokenizer");
});
});

describe("splitPreTokenizer", () => {
22 changes: 22 additions & 0 deletions bindings/node/lib/bindings/tokenizer.d.ts
@@ -7,6 +7,19 @@ import { PreTokenizer } from "./pre-tokenizers";
import { RawEncoding } from "./raw-encoding";
import { Trainer } from "./trainers";

export interface FromPretrainedOptions {
/**
* The revision to download
* @default "main"
*/
revision?: string;
/**
* The auth token to use to access private repositories on the Hugging Face Hub
* @default undefined
*/
authToken?: string;
}

export interface TruncationOptions {
/**
* The length of the previous sequence to be included in the overflowing sequence
@@ -123,6 +136,15 @@
*/
static fromString(s: string): Tokenizer;

/**
* Instantiate a new Tokenizer from an existing file on the
* Hugging Face Hub. Any model repo containing a `tokenizer.json`
* can be used here.
* @param identifier A model identifier on the Hub
* @param options Additional options
*/
static fromPretrained(identifier: string, options?: FromPretrainedOptions): Tokenizer;

/**
* Add the given tokens to the vocabulary
*
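A short usage sketch for the new entry point, assuming the package's top-level `tokenizers` export re-exports `Tokenizer`; the private repo name and token value are placeholders:

```ts
import { Tokenizer } from "tokenizers";

// Any Hub repo containing a `tokenizer.json` works; `revision` defaults to "main".
const tokenizer = Tokenizer.fromPretrained("bert-base-cased");

// A private repo would additionally pass the options declared above
// ("my-org/my-private-model" and "hf_xxx" are placeholders):
const privateTokenizer = Tokenizer.fromPretrained("my-org/my-private-model", {
  revision: "main",
  authToken: "hf_xxx",
});
```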
1 change: 1 addition & 0 deletions bindings/node/lib/bindings/tokenizer.js
@@ -3,6 +3,7 @@ const native = require("./native");
class Tokenizer extends native.tokenizer_Tokenizer {
static fromString = native.tokenizer_Tokenizer_from_string;
static fromFile = native.tokenizer_Tokenizer_from_file;
static fromPretrained = native.tokenizer_Tokenizer_from_pretrained;
}

module.exports = {
28 changes: 28 additions & 0 deletions bindings/node/lib/bindings/tokenizer.test.ts
@@ -64,6 +64,7 @@ describe("Tokenizer", () => {

expect(typeof Tokenizer.fromFile).toBe("function");
expect(typeof Tokenizer.fromString).toBe("function");
expect(typeof Tokenizer.fromPretrained).toBe("function");

expect(typeof tokenizer.addSpecialTokens).toBe("function");
expect(typeof tokenizer.addTokens).toBe("function");
@@ -94,6 +95,33 @@
expect(typeof tokenizer.train).toBe("function");
});

it("can be instantiated from the hub", async () => {
let tokenizer: Tokenizer;
let encode: (
sequence: InputSequence,
pair?: InputSequence | null,
options?: EncodeOptions | null
) => Promise<RawEncoding>;
let output: RawEncoding;

tokenizer = Tokenizer.fromPretrained("bert-base-cased");
encode = promisify(tokenizer.encode.bind(tokenizer));
output = await encode("Hey there dear friend!", null, { addSpecialTokens: false });
expect(output.getTokens()).toEqual(["Hey", "there", "dear", "friend", "!"]);

tokenizer = Tokenizer.fromPretrained("anthony/tokenizers-test");
encode = promisify(tokenizer.encode.bind(tokenizer));
output = await encode("Hey there dear friend!", null, { addSpecialTokens: false });
expect(output.getTokens()).toEqual(["hey", "there", "dear", "friend", "!"]);

tokenizer = Tokenizer.fromPretrained("anthony/tokenizers-test", {
revision: "gpt-2",
});
encode = promisify(tokenizer.encode.bind(tokenizer));
output = await encode("Hey there dear friend!", null, { addSpecialTokens: false });
expect(output.getTokens()).toEqual(["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"]);
});

describe("addTokens", () => {
it("accepts a list of string as new tokens when initial model is empty", () => {
const model = BPE.empty();
@@ -62,9 +62,7 @@ type SentencePieceBPETokenizerConfig = SentencePieceBPETokenizerOptions &
/**
* Represents the BPE algorithm, with the pretokenization used by SentencePiece
*/
-export class SentencePieceBPETokenizer extends BaseTokenizer<
-  SentencePieceBPETokenizerConfig
-> {
+export class SentencePieceBPETokenizer extends BaseTokenizer<SentencePieceBPETokenizerConfig> {
private static readonly defaultOptions: SentencePieceBPETokenizerConfig = {
addPrefixSpace: true,
replacement: "▁",