Commit bbf0c39

FasterTokenizer->FastTokenizer (PaddlePaddle#3719)

joey12300 authored Nov 10, 2022
1 parent d37fd7f commit bbf0c39
Showing 173 changed files with 782 additions and 755 deletions.
4 changes: 2 additions & 2 deletions README_cn.md
@@ -234,7 +234,7 @@ PaddleNLP targets information extraction, semantic retrieval, intelligent question answering, sentiment analysis, and other high…

### ้ซ˜ๆ€ง่ƒฝๅˆ†ๅธƒๅผ่ฎญ็ปƒไธŽๆŽจ็†

#### โšก FasterTokenizer๏ผš้ซ˜ๆ€ง่ƒฝๆ–‡ๆœฌๅค„็†ๅบ“
#### โšก FastTokenizer๏ผš้ซ˜ๆ€ง่ƒฝๆ–‡ๆœฌๅค„็†ๅบ“

<div align="center">
<img src="https://user-images.githubusercontent.com/11793384/168407921-b4395b1d-44bd-41a0-8c58-923ba2b703ef.png" width="400">
@@ -244,7 +244,7 @@ PaddleNLP targets information extraction, semantic retrieval, intelligent question answering, sentiment analysis, and other high…
AutoTokenizer.from_pretrained("ernie-3.0-medium-zh", use_faster=True)
```

ไธบไบ†ๅฎž็Žฐๆ›ดๆž่‡ด็š„ๆจกๅž‹้ƒจ็ฝฒๆ€ง่ƒฝ๏ผŒๅฎ‰่ฃ…FastTokenizersๅŽๅช้œ€ๅœจ`AutoTokenizer` APIไธŠๆ‰“ๅผ€ `use_faster=True`้€‰้กน๏ผŒๅณๅฏ่ฐƒ็”จC++ๅฎž็Žฐ็š„้ซ˜ๆ€ง่ƒฝๅˆ†่ฏ็ฎ—ๅญ๏ผŒ่ฝปๆพ่Žทๅพ—่ถ…Python็™พไฝ™ๅ€็š„ๆ–‡ๆœฌๅค„็†ๅŠ ้€Ÿ๏ผŒๆ›ดๅคšไฝฟ็”จ่ฏดๆ˜Žๅฏๅ‚่€ƒ[FasterTokenizerๆ–‡ๆกฃ](./faster_tokenizer)ใ€‚
ไธบไบ†ๅฎž็Žฐๆ›ดๆž่‡ด็š„ๆจกๅž‹้ƒจ็ฝฒๆ€ง่ƒฝ๏ผŒๅฎ‰่ฃ…FastTokenizersๅŽๅช้œ€ๅœจ`AutoTokenizer` APIไธŠๆ‰“ๅผ€ `use_faster=True`้€‰้กน๏ผŒๅณๅฏ่ฐƒ็”จC++ๅฎž็Žฐ็š„้ซ˜ๆ€ง่ƒฝๅˆ†่ฏ็ฎ—ๅญ๏ผŒ่ฝปๆพ่Žทๅพ—่ถ…Python็™พไฝ™ๅ€็š„ๆ–‡ๆœฌๅค„็†ๅŠ ้€Ÿ๏ผŒๆ›ดๅคšไฝฟ็”จ่ฏดๆ˜Žๅฏๅ‚่€ƒ[FastTokenizerๆ–‡ๆกฃ](./fast_tokenizer)ใ€‚

#### โšก๏ธ FasterGeneration๏ผš้ซ˜ๆ€ง่ƒฝ็”ŸๆˆๅŠ ้€Ÿๅบ“

@@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 3.10)

project(tokenizers LANGUAGES CXX C VERSION 1.0)

option(WITH_TESTING "Compile PaddleNLP faster_tokenizer with unit testing" OFF)
option(WITH_PYTHON "Compile PaddleNLP faster_tokenizer with python interpreter" ON)
option(WITH_TESTING "Compile PaddleNLP fast_tokenizer with unit testing" OFF)
option(WITH_PYTHON "Compile PaddleNLP fast_tokenizer with python interpreter" ON)
add_definitions(-DFASTERTOKENIZER_LIB)

set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
@@ -137,7 +137,7 @@ include_directories(${TOKENIZERS_INSTALL_INCLUDE_DIR})
include(generic)
include(third_party)

add_subdirectory(faster_tokenizer)
add_subdirectory(fast_tokenizer)

if(WITH_PYTHON)

@@ -155,18 +155,18 @@ add_custom_target(build_tokenizers_bdist_wheel ALL
DEPENDS copy_python_tokenizers)
endif()

else(WITH_PYTHON) # Pack faster_tokenizer cpp lib
else(WITH_PYTHON) # Pack fast_tokenizer cpp lib

set(CPP_PACKAGE_DIR ${CMAKE_BINARY_DIR}/cpp/faster_tokenizer)
set(CPP_PACKAGE_DIR ${CMAKE_BINARY_DIR}/cpp/fast_tokenizer)
add_custom_target(build_cpp_package_dir ALL
COMMAND ${CMAKE_COMMAND} -E make_directory ${CPP_PACKAGE_DIR}/lib ${CPP_PACKAGE_DIR}/include ${CPP_PACKAGE_DIR}/third_party/include ${CPP_PACKAGE_DIR}/third_party/lib
DEPENDS core_tokenizers)

# copy cmake
file(COPY ${PROJECT_SOURCE_DIR}/FasterTokenizer.cmake DESTINATION ${CPP_PACKAGE_DIR}/)
file(COPY ${PROJECT_SOURCE_DIR}/FastTokenizer.cmake DESTINATION ${CPP_PACKAGE_DIR}/)

# copy headers
file(COPY ${PROJECT_SOURCE_DIR}/faster_tokenizer/ DESTINATION ${CPP_PACKAGE_DIR}/include/faster_tokenizer/
file(COPY ${PROJECT_SOURCE_DIR}/fast_tokenizer/ DESTINATION ${CPP_PACKAGE_DIR}/include/fast_tokenizer/
FILES_MATCHING PATTERN "*.h"
PATTERN "test" EXCLUDE
PATTERN "demo" EXCLUDE
@@ -181,7 +181,7 @@ add_custom_target(copy_third_party_headers ALL

# copy library
set(TOKENIZER_CORE_NAME "core_tokenizers")
set(TOKENIZER_CORE_PATH ${CMAKE_BINARY_DIR}/faster_tokenizer)
set(TOKENIZER_CORE_PATH ${CMAKE_BINARY_DIR}/fast_tokenizer)
if (WIN32)
set(ICU_DLL_DIR ${CMAKE_BINARY_DIR}/third_party/icu/src/extern_icu/icu4c/bin64)
set(ICU_LIB_DIR ${CMAKE_BINARY_DIR}/third_party/icu/src/extern_icu/icu4c/lib64)
File renamed without changes.
File renamed without changes.
105 changes: 105 additions & 0 deletions fast_tokenizer/README.md
@@ -0,0 +1,105 @@
# FastTokenizer

------------------------------------------------------------------------------------------

<p align="center">
<a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
<a href="https://github.com/PaddlePaddle/PaddleNLP/releases"><img src="https://img.shields.io/github/v/release/PaddlePaddle/PaddleNLP?color=ffa"></a>
<a href=""><img src="https://img.shields.io/badge/python-3.6.2+-aff.svg"></a>
<a href=""><img src="https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg"></a>
<a href="https://github.com/PaddlePaddle/PaddleNLP/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/PaddleNLP?color=9ea"></a>
<a href="https://github.com/PaddlePaddle/PaddleNLP/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/PaddleNLP?color=3af"></a>
<a href="https://pypi.org/project/paddlenlp/"><img src="https://img.shields.io/pypi/dm/paddlenlp?color=9cf"></a>
<a href="https://github.com/PaddlePaddle/PaddleNLP/issues"><img src="https://img.shields.io/github/issues/PaddlePaddle/PaddleNLP?color=9cc"></a>
<a href="https://github.com/PaddlePaddle/PaddleNLP/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleNLP?color=ccf"></a>
</p>
FastTokenizerๆ˜ฏไธ€ๆฌพ็ฎ€ๅ•ๆ˜“็”จใ€ๅŠŸ่ƒฝๅผบๅคง็š„่ทจๅนณๅฐ้ซ˜ๆ€ง่ƒฝๆ–‡ๆœฌ้ข„ๅค„็†ๅบ“๏ผŒ้›†ๆˆไธš็•Œๅคšไธชๅธธ็”จ็š„Tokenizerๅฎž็Žฐ๏ผŒๆ”ฏๆŒไธๅŒNLPๅœบๆ™ฏไธ‹็š„ๆ–‡ๆœฌ้ข„ๅค„็†ๅŠŸ่ƒฝ๏ผŒๅฆ‚ๆ–‡ๆœฌๅˆ†็ฑปใ€้˜…่ฏป็†่งฃ๏ผŒๅบๅˆ—ๆ ‡ๆณจ็ญ‰ใ€‚็ป“ๅˆPaddleNLP Tokenizerๆจกๅ—๏ผŒไธบ็”จๆˆทๅœจ่ฎญ็ปƒใ€ๆŽจ็†้˜ถๆฎตๆไพ›้ซ˜ๆ•ˆ้€š็”จ็š„ๆ–‡ๆœฌ้ข„ๅค„็†่ƒฝๅŠ›ใ€‚

## ็‰นๆ€ง

- ้ซ˜ๆ€ง่ƒฝใ€‚็”ฑไบŽๅบ•ๅฑ‚้‡‡็”จC++ๅฎž็Žฐ๏ผŒๆ‰€ไปฅๅ…ถๆ€ง่ƒฝ่ฟœ้ซ˜ไบŽ็›ฎๅ‰ๅธธ่ง„Pythonๅฎž็Žฐ็š„Tokenizerใ€‚ๅœจๆ–‡ๆœฌๅˆ†็ฑปไปปๅŠกไธŠ๏ผŒFastTokenizerๅฏนๆฏ”Python็‰ˆๆœฌTokenizerๅŠ ้€Ÿๆฏ”ๆœ€้ซ˜ๅฏ่พพ20ๅ€ใ€‚
- ่ทจๅนณๅฐใ€‚FastTokenizerๅฏๅœจไธๅŒ็š„็ณป็ปŸๅนณๅฐไธŠไฝฟ็”จ๏ผŒ็›ฎๅ‰ๅทฒๆ”ฏๆŒWindows x64๏ผŒLinux x64ไปฅๅŠMacOS 10.14+ๅนณๅฐไธŠไฝฟ็”จใ€‚
- ๅคš็ผ–็จ‹่ฏญ่จ€ๆ”ฏๆŒใ€‚FastTokenizerๆไพ›ๅœจC++ใ€Python่ฏญ่จ€ไธŠๅผ€ๅ‘็š„่ƒฝๅŠ›ใ€‚
- ็ตๆดปๆ€งๅผบใ€‚็”จๆˆทๅฏไปฅ้€š่ฟ‡ๆŒ‡ๅฎšไธๅŒ็š„FastTokenizer็ป„ไปถๅฎšๅˆถๆปก่ถณ้œ€ๆฑ‚็š„Tokenizerใ€‚

## ๅฟซ้€Ÿๅผ€ๅง‹

ไธ‹้ขๅฐ†ไป‹็ปPython็‰ˆๆœฌFastTokenizer็š„ไฝฟ็”จๆ–นๅผ๏ผŒC++็‰ˆๆœฌ็š„ไฝฟ็”จๆ–นๅผๅฏๅ‚่€ƒ[FastTokenizer C++ Demo](./fast_tokenizer/demo/README.md)ใ€‚

### ๅ‰็ฝฎไพ่ต–

- Windows 64ไฝ็ณป็ปŸ
- Linux x64็ณป็ปŸ
- MacOS 10.14+็ณป็ปŸ๏ผˆm1่Šฏ็‰‡็š„MacOS๏ผŒ้œ€่ฆไฝฟ็”จx86_64็‰ˆๆœฌ็š„Anacondaไฝœไธบpython็Žฏๅขƒๆ–นๅฏๅฎ‰่ฃ…ไฝฟ็”จ๏ผ‰
- Python 3.6 ~ 3.9

### ๅฎ‰่ฃ…FastTokenizer

```shell
pip install fast_tokenizer
```
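
A quick way to confirm the wheel installed correctly is to import the package from Python; this is a sanity check only (an `ImportError` means the installation failed):

```python
# Prints where the package was installed if the import succeeds.
import fast_tokenizer
print(fast_tokenizer.__file__)
```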

### FastTokenizerไฝฟ็”จ็คบไพ‹

- ๅ‡†ๅค‡่ฏ่กจ

```shell
# Linuxๆˆ–่€…Mac็”จๆˆทๅฏ็›ดๆŽฅๆ‰ง่กŒไปฅไธ‹ๅ‘ฝไปคไธ‹่ฝฝๆต‹่ฏ•็š„่ฏ่กจ๏ผŒWindows ็”จๆˆทๅฏๅœจๆต่งˆๅ™จไธŠไธ‹่ฝฝๅˆฐๆœฌๅœฐใ€‚
wget https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt
```

- ๅˆ‡่ฏ็คบไพ‹

FastTokenizerๅบ“ๅ†…็ฝฎNLPไปปๅŠกๅธธ็”จ็š„Tokenizer๏ผŒๅฆ‚ErnieFastTokenizerใ€‚ไธ‹้ขๅฐ†ๅฑ•็คบFastTokenizer็š„็ฎ€ๅ•็”จๆณ•ใ€‚

```python
from fast_tokenizer import ErnieFastTokenizer, models
# 1. ๅŠ ่ฝฝ่ฏ่กจ
vocab = models.WordPiece.read_file("ernie_vocab.txt")
# 2. ๅฎžไพ‹ๅŒ–ErnieFastTokenizerๅฏน่ฑก
fast_tokenizer = ErnieFastTokenizer(vocab)
# 3. ๅˆ‡่ฏ
output = fast_tokenizer.encode("ๆˆ‘็ˆฑไธญๅ›ฝ")
# 4. ่พ“ๅ‡บ็ป“ๆžœ
print("ids: ", output.ids)
print("type_ids: ", output.type_ids)
print("tokens: ", output.tokens)
print("offsets: ", output.offsets)
print("attention_mask: ", output.attention_mask)
```

### FastTokenizerๅœจPaddleNLP Tokenizerๆจกๅ—ๅŠ ้€Ÿ็คบไพ‹

PaddleNLP Tokenizerๆจกๅ—ๅฏ็ฎ€ๅ•ๅœฐๅบ”็”จๅœจๆจกๅž‹่ฎญ็ปƒไปฅๅŠๆŽจ็†้ƒจ็ฝฒ็š„ๆ–‡ๆœฌ้ข„ๅค„็†้˜ถๆฎต๏ผŒๅนถ้€š่ฟ‡`AutoTokenizer.from_pretrained`ๆ–นๅผๅฎžไพ‹ๅŒ–็›ธๅบ”็š„Tokenizerใ€‚ๅ…ถไธญ`AutoTokenizer`้ป˜่ฎคๅŠ ่ฝฝๅพ—ๅˆฐ็š„Tokenizerๆ˜ฏๅธธ่ง„Pythonๅฎž็Žฐ็š„Tokenizer๏ผŒๅ…ถๆ€ง่ƒฝไผšไฝŽไบŽC++ๅฎž็Žฐ็š„FastTokenizerใ€‚ไธบไบ†ๆๅ‡PaddleNLP Tokenizerๆจกๅ—ๆ€ง่ƒฝ๏ผŒ็›ฎๅ‰PaddleNLP Tokenizerๆจกๅ—ๅทฒ็ปๆ”ฏๆŒไฝฟ็”จFastTokenizerไฝœไธบTokenizer็š„ๅŽ็ซฏๅŠ ้€Ÿๅˆ‡่ฏ้˜ถๆฎตใ€‚ๅœจ็Žฐๆœ‰็š„TokenizerๅŠ ่ฝฝๆŽฅๅฃไธญ๏ผŒไป…้œ€ๆทปๅŠ `use_fast=True`่ฟ™ไธ€ๅ…ณ้”ฎ่ฏๅ‚ๆ•ฐ๏ผŒๅ…ถไฝ™ไปฃ็ ไฟๆŒไธๅ˜๏ผŒๅณๅฏๅŠ ่ฝฝFast็‰ˆๆœฌ็š„Tokenizer๏ผŒไปฃ็ ็คบไพ‹ๅฆ‚ไธ‹๏ผš

```python
from paddlenlp.transformers import AutoTokenizer

# ้ป˜่ฎคๅŠ ่ฝฝPython็‰ˆๆœฌ็š„Tokenizer
tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh')
# ๆ‰“ๅผ€use_fastๅผ€ๅ…ณ๏ผŒๅฏๅŠ ่ฝฝFast็‰ˆๆœฌTokenizer
fast_tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh', use_fast=True)

text1 = tokenizer('่‡ช็„ถ่ฏญ่จ€ๅค„็†')
text2 = fast_tokenizer('่‡ช็„ถ่ฏญ่จ€ๅค„็†')

print(text1)
print(text2)
```

็›ฎๅ‰PaddleNLPๅทฒๆ”ฏๆŒBERTใ€ERNIEใ€TinyBERTไปฅๅŠERNIE-M 4็งTokenizer็š„Fast็‰ˆๆœฌ๏ผŒๅ…ถไฝ™ๆจกๅž‹็š„Tokenizerๆš‚ไธๆ”ฏๆŒFast็‰ˆๆœฌใ€‚

## FAQ

Q๏ผšๆˆ‘ๅœจAutoTokenizer.from_pretrainedๆŽฅๅฃไธŠๅทฒ็ปๆ‰“ๅผ€`use_fast=True`ๅผ€ๅ…ณ๏ผŒไธบไป€ไนˆๆ–‡ๆœฌ้ข„ๅค„็†้˜ถๆฎตๆ€ง่ƒฝไธŠๅฅฝๅƒๆฒกๆœ‰ไปปไฝ•ๅ˜ๅŒ–๏ผŸ

A๏ผšๅœจๆœ‰ไธ‰็งๆƒ…ๅ†ตไธ‹๏ผŒๆ‰“ๅผ€`use_fast=True`ๅผ€ๅ…ณๅฏ่ƒฝๆ— ๆณ•ๆๅ‡ๆ€ง่ƒฝ๏ผš
1. ๆฒกๆœ‰ๅฎ‰่ฃ…fast_tokenizerใ€‚่‹ฅๅœจๆฒกๆœ‰ๅฎ‰่ฃ…fast_tokenizerๅบ“็š„ๆƒ…ๅ†ตไธ‹ๆ‰“ๅผ€`use_fast`ๅผ€ๅ…ณ๏ผŒPaddleNLPไผš็ป™ๅ‡บไปฅไธ‹warning๏ผš"Can't find the fast_tokenizer package, please ensure install fast_tokenizer correctly. "ใ€‚

2. ๅŠ ่ฝฝ็š„Tokenizer็ฑปๅž‹ๆš‚ไธๆ”ฏๆŒFast็‰ˆๆœฌใ€‚็›ฎๅ‰ๆ”ฏๆŒ4็งTokenizer็š„Fast็‰ˆๆœฌ๏ผŒๅˆ†ๅˆซๆ˜ฏBERTใ€ERNIEใ€TinyBERTไปฅๅŠERNIE-M Tokenizerใ€‚่‹ฅๅŠ ่ฝฝไธๆ”ฏๆŒFast็‰ˆๆœฌ็š„Tokenizerๆƒ…ๅ†ตไธ‹ๆ‰“ๅผ€`use_fast`ๅผ€ๅ…ณ๏ผŒPaddleNLPไผš็ป™ๅ‡บไปฅไธ‹warning๏ผš"The tokenizer XXX doesn't have the fast version. Please check the map paddlenlp.transformers.auto.tokenizer.FASTER_TOKENIZER_MAPPING_NAMES to see which fast tokenizers are currently supported."

3. ๅพ…ๅˆ‡่ฏๆ–‡ๆœฌ้•ฟๅบฆ่ฟ‡็Ÿญ๏ผˆๅฆ‚ๆ–‡ๆœฌๅนณๅ‡้•ฟๅบฆๅฐไบŽ5๏ผ‰ใ€‚่ฟ™็งๆƒ…ๅ†ตไธ‹ๅˆ‡่ฏๅผ€้”€ๅฏ่ƒฝไธๆ˜ฏๆ•ดไธชๆ–‡ๆœฌ้ข„ๅค„็†็š„ๆ€ง่ƒฝ็“ถ้ขˆ๏ผŒๅฏผ่‡ดๅœจไฝฟ็”จFastTokenizerๅŽไปๆ— ๆณ•ๆๅ‡ๆ•ดไฝ“ๆ€ง่ƒฝใ€‚

## ็›ธๅ…ณๆ–‡ๆกฃ

[FastTokenizer็ผ–่ฏ‘ๆŒ‡ๅ—](docs/compile/README.md)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -1,11 +1,11 @@
# FasterTokenizer็ผ–่ฏ‘ๆŒ‡ๅ—
# FastTokenizer็ผ–่ฏ‘ๆŒ‡ๅ—

ๆœฌๆ–‡ๆกฃ่ฏดๆ˜Ž็ผ–่ฏ‘FasterTokenizer C++ๅบ“ใ€Pythonๅบ“ไธค็ง็ผ–่ฏ‘่ฟ‡็จ‹๏ผŒๆ นๆฎ็ผ–่ฏ‘็š„ๅนณๅฐๅ‚่€ƒๅฆ‚ไธ‹ๆ–‡ๆกฃ
ๆœฌๆ–‡ๆกฃ่ฏดๆ˜Ž็ผ–่ฏ‘FastTokenizer C++ๅบ“ใ€Pythonๅบ“ไธค็ง็ผ–่ฏ‘่ฟ‡็จ‹๏ผŒๆ นๆฎ็ผ–่ฏ‘็š„ๅนณๅฐๅ‚่€ƒๅฆ‚ไธ‹ๆ–‡ๆกฃ

- [Linux & Mac ็ผ–่ฏ‘](./how_to_build_linux_and_mac.md)
- [Windows็ผ–่ฏ‘](./how_to_build_windows.md)

FasterTokenizerไฝฟ็”จCMake็ผ–่ฏ‘๏ผŒๅ…ถไธญ็ผ–่ฏ‘่ฟ‡็จ‹ไธญ๏ผŒๅ„ๅนณๅฐไธŠ็ผ–่ฏ‘้€‰้กนๅฆ‚ไธ‹่กจๆ‰€็คบ
FastTokenizerไฝฟ็”จCMake็ผ–่ฏ‘๏ผŒๅ…ถไธญ็ผ–่ฏ‘่ฟ‡็จ‹ไธญ๏ผŒๅ„ๅนณๅฐไธŠ็ผ–่ฏ‘้€‰้กนๅฆ‚ไธ‹่กจๆ‰€็คบ

| ้€‰้กน | ไฝœ็”จ | ๅค‡ๆณจ |
|:---- | :--- | :--- |
@@ -9,7 +9,7 @@

```bash
git clone https://github.com/PaddlePaddle/PaddleNLP.git
cd PaddleNLP/faster_tokenizer
cd PaddleNLP/fast_tokenizer
mkdir build && cd build
cmake .. -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
make -j8
@@ -21,7 +21,7 @@ make -j8

```bash
git clone https://github.com/PaddlePaddle/PaddleNLP.git
cd PaddleNLP/faster_tokenizer
cd PaddleNLP/fast_tokenizer
mkdir build && cd build
# Set up the Python environment
export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
@@ -13,7 +13,7 @@

```bash
git clone https://github.com/PaddlePaddle/PaddleNLP.git
cd PaddleNLP/faster_tokenizer
cd PaddleNLP/fast_tokenizer
mkdir build && cd build
cmake .. -G "Ninja" -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
ninja -j8
@@ -25,7 +25,7 @@ ninja -j8

```bash
git clone https://github.com/PaddlePaddle/PaddleNLP.git
cd PaddleNLP/faster_tokenizer
cd PaddleNLP/fast_tokenizer
mkdir build && cd build
# The Python library location must be specified
cmake .. -G "Ninja" -DWITH_PYTHON=ON ^
@@ -13,7 +13,7 @@ endif()
if (WITH_PYTHON)
add_subdirectory(pybind)
cc_library(core_tokenizers SHARED
SRCS pybind/pybind.cc tokenizers/ernie_faster_tokenizer.cc
SRCS pybind/pybind.cc tokenizers/ernie_fast_tokenizer.cc
DEPS pybind python pybind_normalizers pybind_utils
pybind_pretokenizers pybind_models pybind_decoders
pybind_postprocessors pybind_tokenizers pybind_exception
@@ -33,7 +33,7 @@ endif()
else(WITH_PYTHON)
# add_subdirectory(tokenizers)
cc_library(core_tokenizers SHARED
SRCS tokenizers/ernie_faster_tokenizer.cc
SRCS tokenizers/ernie_fast_tokenizer.cc
DEPS normalizers pretokenizers models decoders
postprocessors core added_vocabulary tokenizer json)

@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "faster_tokenizer/core/added_vocabulary.h"
#include "faster_tokenizer/models/model.h"
#include "faster_tokenizer/normalizers/normalizer.h"
#include "faster_tokenizer/pretokenizers/pretokenizer.h"
#include "fast_tokenizer/core/added_vocabulary.h"
#include "fast_tokenizer/models/model.h"
#include "fast_tokenizer/normalizers/normalizer.h"
#include "fast_tokenizer/pretokenizers/pretokenizer.h"
#include "glog/logging.h"
#include "re2/re2.h"

namespace paddlenlp {
namespace faster_tokenizer {
namespace fast_tokenizer {
namespace core {

inline bool StartWithWord(const std::string& sequence) {
@@ -420,5 +420,5 @@ void to_json(nlohmann::json& j, const AddedVocabulary& added_vocab) {
}

} // namespace core
} // namespace faster_tokenizer
} // namespace fast_tokenizer
} // namespace paddlenlp
@@ -18,15 +18,15 @@ limitations under the License. */
#include <string>
#include <unordered_set>

#include "faster_tokenizer/core/base.h"
#include "fast_tokenizer/core/base.h"
#include "nlohmann/json.hpp"

namespace re2 {
class RE2;
} // namespace re2

namespace paddlenlp {
namespace faster_tokenizer {
namespace fast_tokenizer {

namespace normalizers {
class Normalizer;
@@ -139,15 +139,15 @@ class FASTERTOKENIZER_DECL AddedVocabulary {
};

} // namespace core
} // namespace faster_tokenizer
} // namespace fast_tokenizer
} // namespace paddlenlp

namespace std {
template <>
class hash<paddlenlp::faster_tokenizer::core::AddedToken> {
class hash<paddlenlp::fast_tokenizer::core::AddedToken> {
public:
size_t operator()(
const paddlenlp::faster_tokenizer::core::AddedToken& added_token) const {
const paddlenlp::fast_tokenizer::core::AddedToken& added_token) const {
return std::hash<std::string>()(added_token.GetContent());
}
};
@@ -22,7 +22,7 @@ limitations under the License. */
#include <vector>

#include "nlohmann/json.hpp"
#include "faster_tokenizer/utils/utils.h"
#include "fast_tokenizer/utils/utils.h"

namespace std {
template <>
@@ -36,7 +36,7 @@ struct hash<std::pair<uint32_t, uint32_t>> {
}

namespace paddlenlp {
namespace faster_tokenizer {
namespace fast_tokenizer {
namespace core {

enum FASTERTOKENIZER_DECL OffsetType { CHAR, BYTE };
@@ -359,5 +359,5 @@ struct FASTERTOKENIZER_DECL BPEWord {
};

} // namespace core
} // namespace faster_tokenizer
} // namespace fast_tokenizer
} // namespace paddlenlp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "faster_tokenizer/core/encoding.h"
#include "fast_tokenizer/core/encoding.h"
#include <algorithm>
#include <cassert>
#include <climits>
@@ -24,7 +24,7 @@ limitations under the License. */
#endif

namespace paddlenlp {
namespace faster_tokenizer {
namespace fast_tokenizer {
namespace core {

Encoding::Encoding(const std::vector<uint32_t>& ids,
@@ -693,5 +693,5 @@ void RunMultiThread(std::function<void(size_t, size_t)> func,
}

} // namespace core
} // namespace faster_tokenizer
} // namespace fast_tokenizer
} // namespace paddlenlp
@@ -18,8 +18,8 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <vector>
#include "faster_tokenizer/core/base.h"
#include "faster_tokenizer/utils/utils.h"
#include "fast_tokenizer/core/base.h"
#include "fast_tokenizer/utils/utils.h"

#include <math.h>
#include <stdlib.h>
@@ -28,7 +28,7 @@ limitations under the License. */
using namespace std;

namespace paddlenlp {
namespace faster_tokenizer {
namespace fast_tokenizer {
namespace core {

class FASTERTOKENIZER_DECL Encoding {
@@ -133,5 +133,5 @@ int FASTERTOKENIZER_DECL GetThreadNum(size_t batch_size);
void FASTERTOKENIZER_DECL
RunMultiThread(std::function<void(size_t, size_t)> func, size_t batch_size);
} // namespace core
} // namespace faster_tokenizer
} // namespace fast_tokenizer
} // namespace paddlenlp