Skip to content

Commit

Permalink
commit changes to modernize project with upstream changes
Browse files Browse the repository at this point in the history
  • Loading branch information
wbarnha committed Aug 29, 2024
1 parent 3f3b595 commit 27ea468
Show file tree
Hide file tree
Showing 131 changed files with 4,327 additions and 104 deletions.
39 changes: 29 additions & 10 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@ requires-python = ">= 3.9"
dynamic = ["version"]

[build-system]
requires = ["setuptools", "Cython", "wheel"]
requires = ["setuptools", "wheel", "cython", "pkgconfig", "setuptools_scm"]
build-backend = "setuptools.build_meta"

[project.scripts]
cchardetect = "cchardet.cli.cchardetect:main"

[tool.setuptools.dynamic]
version = { attr = "cchardet.__version__" }
[tool.setuptools_scm]
# enables setuptools_scm to provide the dynamic version


[tool.rye]
dev-dependencies = [
Expand All @@ -49,13 +50,31 @@ format.quote-style = "double"
format.indent-style = "space"

[tool.cibuildwheel]
skip = "pp* cp36-* cp37-* cp38-*"
archs = "auto"
build-frontend = "build"
skip = ["*-win32"]
archs = ["auto"]
test-requires = ['pytest']
test-command = [
'cd {project}',
'python -m pytest {project}'
]

environment = {INCLUDE_PATH="/usr/local/include/uchardet", LIBRARY_PATH="/usr/local/lib64/"}
before-build = [
"git submodule sync --recursive",
"git submodule update --init --force --recursive --depth=1",
]

[tool.cibuildwheel.macos]
environment = {INCLUDE_PATH="/usr/local/include/uchardet", LIBRARY_PATH="/usr/local/lib/"}
before-build = [
"pip install cython==3.0.10",
"cython {project}/src/cchardet/_cchardet.pyx",
"git submodule sync --recursive",
"git submodule update --init --force --recursive --depth=1",
]

# NOTICE: The tests pass as expected locally, but under cibuildwheel the character-encoding detection fails for certain files, so the tests are temporarily disabled.
# test-requires = "pytest"
# test-command = "pytest -vs {project}/tests"
[tool.cibuildwheel.windows]
before-build = [
"git submodule sync --recursive",
"git submodule update --init --force --recursive --depth=1",
"make pip"
]
165 changes: 88 additions & 77 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,93 +1,104 @@
#!/usr/bin/env python
# coding: utf-8

import glob
import os
import codecs
import re
from setuptools.command.build_ext import build_ext

from setuptools import Extension, setup
try:
import sysconfig
except ImportError:
from distutils import sysconfig

cchardet_dir = "src/cchardet/"
uchardet_dir = "src/ext/uchardet/src"
cchardet_sources = glob.glob(cchardet_dir + "*.cpp")
sources = cchardet_sources
try:
from setuptools import setup, Extension
except ImportError:
from distutils.core import setup, Extension

from Cython.Build import cythonize


# Short alias so the path constants below stay readable.
join = os.path.join

# Directory of the Python/Cython package, with a trailing path separator.
cchardet_dir = join("src", "cchardet") + os.path.sep
# Directory scanned for uchardet ``.cpp`` sources and used as the include dir.
uchardet_dir = join("src", "ext", "uchardet", "src")
# Sub-directory of uchardet_dir holding the per-language model sources.
uchardet_lang_models_dir = join(uchardet_dir, "LangModels")

# The Cython source of the extension module itself.
cchardet_sources = [join("src", "cchardet", "_cchardet.pyx")]
uchardet_sources = [
os.path.join(uchardet_dir, "LangModels/LangArabicModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangBelarusianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangBulgarianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangCatalanModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangCroatianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangCzechModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangDanishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangEnglishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangEsperantoModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangEstonianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangFinnishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangFrenchModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangGeorgianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangGermanModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangGreekModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangHebrewModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangHindiModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangHungarianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangIrishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangItalianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangLatvianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangLithuanianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangMacedonianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangMalteseModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangNorwegianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangPolishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangPortugueseModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangRomanianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangRussianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangSerbianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangSlovakModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangSloveneModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangSpanishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangSwedishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangThaiModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangTurkishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangUkrainianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangVietnameseModel.cpp"),
os.path.join(uchardet_dir, "CharDistribution.cpp"),
os.path.join(uchardet_dir, "JpCntx.cpp"),
os.path.join(uchardet_dir, "nsBig5Prober.cpp"),
os.path.join(uchardet_dir, "nsCharSetProber.cpp"),
os.path.join(uchardet_dir, "nsCJKDetector.cpp"),
os.path.join(uchardet_dir, "nsEscCharsetProber.cpp"),
os.path.join(uchardet_dir, "nsEscSM.cpp"),
os.path.join(uchardet_dir, "nsEUCJPProber.cpp"),
os.path.join(uchardet_dir, "nsEUCKRProber.cpp"),
os.path.join(uchardet_dir, "nsEUCTWProber.cpp"),
os.path.join(uchardet_dir, "nsGB2312Prober.cpp"),
os.path.join(uchardet_dir, "nsHebrewProber.cpp"),
os.path.join(uchardet_dir, "nsJohabProber.cpp"),
os.path.join(uchardet_dir, "nsLanguageDetector.cpp"),
os.path.join(uchardet_dir, "nsLatin1Prober.cpp"),
os.path.join(uchardet_dir, "nsMBCSGroupProber.cpp"),
os.path.join(uchardet_dir, "nsMBCSSM.cpp"),
os.path.join(uchardet_dir, "nsSBCharSetProber.cpp"),
os.path.join(uchardet_dir, "nsSBCSGroupProber.cpp"),
os.path.join(uchardet_dir, "nsSJISProber.cpp"),
os.path.join(uchardet_dir, "nsUniversalDetector.cpp"),
os.path.join(uchardet_dir, "nsUTF8Prober.cpp"),
os.path.join(uchardet_dir, "uchardet.cpp"),
join(uchardet_dir, file)
for file in os.listdir(uchardet_dir)
if file.endswith(".cpp")
]
sources += uchardet_sources
uchardet_lang_source = [
join(uchardet_lang_models_dir, file)
for file in os.listdir(uchardet_lang_models_dir)
if file.endswith(".cpp")
]
sources = cchardet_sources + uchardet_sources + uchardet_lang_source

# Extra keyword arguments forwarded to ``Extension``.
# uchardet_dir is a single directory, so wrap it in a one-element list
# directly; os.pathsep separates entries of PATH-style lists and is not the
# right tool for turning one path into a list.
ext_args = {
    "include_dirs": [uchardet_dir],
    "library_dirs": [uchardet_dir],
}


# Remove the "-Wstrict-prototypes" compiler option, which isn't valid for C++.
# The dict returned by sysconfig is edited in place; only existing keys are
# reassigned, so mutating it while iterating over items() is safe.
cfg_vars = sysconfig.get_config_vars()
for key, value in cfg_vars.items():
    if isinstance(value, str):
        cfg_vars[key] = value.replace("-Wstrict-prototypes", "")
        # The optimisation level is deliberately left alone: it is doubtful
        # that switching -O2 to -O3 would actually improve speed.


# Extension definition for cchardet._cchardet: every path in `sources` is built as C++11.
cchardet_module = Extension("cchardet._cchardet", sources, language="c++", extra_compile_args=['-std=c++11'], **ext_args,)


def read(f):
    """Return the stripped text content of a file next to this script.

    Args:
        f: File name (or path) resolved relative to the directory that
            contains this file; an absolute path is used as-is.

    Returns:
        The file's text with leading and trailing whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    path = os.path.join(os.path.dirname(__file__), f)
    # Decode explicitly as UTF-8 so the result does not depend on the build
    # machine's locale, and close the handle deterministically.
    with open(path, encoding="utf-8") as handle:
        return handle.read().strip()


setup(
name="faust-cchardet",
author="PyYoshi",
author_email="[email protected]",
url=r"https://github.com/faust-streaming/cChardet",
description="cChardet is high speed universal character encoding detector.",
long_description="\n\n".join((read("README.md"), read("CHANGES.md"))),
license="Mozilla Public License",
classifiers=[
"Development Status :: 6 - Mature",
"License :: OSI Approved :: Mozilla Public License 1.1 (MPL 1.1)",
"License :: OSI Approved :: GNU General Public License (GPL)",
"License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)",
"Programming Language :: Cython",
"Programming Language :: Python",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"Topic :: Software Development :: Libraries",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
],
keywords=["cython", "chardet", "charsetdetect"],
cmdclass={"build_ext": build_ext},
package_dir={"": "src"},
packages=[
"cchardet",
],
ext_modules=[
Extension(
"cchardet._cchardet",
sources=sources,
include_dirs=[uchardet_dir],
language="c++",
extra_compile_args=['-std=c++11'],
)
],
scripts=["src/cchardet/cli/cchardetect.py"],
ext_modules=cythonize(
[
cchardet_module,
],
cplus=True,
compiler_directives={"language_level": "3"}, # Python 3
),
)
5 changes: 1 addition & 4 deletions src/cchardet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
from . import _cchardet

version = (2, 2, 0, "alpha", 3)
__version__ = "2.2.0a3"
from cchardet import _cchardet


def detect(msg):
Expand Down
26 changes: 13 additions & 13 deletions src/tests/cchardet_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,23 @@
import sys

SKIP_LIST = [
os.path.join("src", "tests", "testdata", "ja", "utf-16le.txt"),
os.path.join("src", "tests", "testdata", "ja", "utf-16be.txt"),
os.path.join("src", "tests", "testdata", "es", "iso-8859-15.txt"),
os.path.join("src", "tests", "testdata", "da", "iso-8859-1.txt"),
os.path.join("src", "tests", "testdata", "he", "iso-8859-8.txt"),
os.path.join("tests", "testdata", "ja", "utf-16le.txt"),
os.path.join("tests", "testdata", "ja", "utf-16be.txt"),
os.path.join("tests", "testdata", "es", "iso-8859-15.txt"),
os.path.join("tests", "testdata", "da", "iso-8859-1.txt"),
os.path.join("tests", "testdata", "he", "iso-8859-8.txt"),
]

if sys.maxsize <= 2**32:
# Fails on i686 only, original cchardet test fails too
SKIP_LIST.append(os.path.join("src", "tests", "testdata", "th", "tis-620.txt"))
SKIP_LIST.append(os.path.join("src", "tests", "testdata", "fi", "iso-8859-1.txt"))
SKIP_LIST.append(os.path.join("src", "tests", "testdata", "ga", "iso-8859-1.txt"))
SKIP_LIST.append(os.path.join("tests", "testdata", "th", "tis-620.txt"))
SKIP_LIST.append(os.path.join("tests", "testdata", "fi", "iso-8859-1.txt"))
SKIP_LIST.append(os.path.join("tests", "testdata", "ga", "iso-8859-1.txt"))

# Python can't decode encoding
SKIP_LIST_02 = [
os.path.join("src", "tests", "testdata", "vi", "viscii.txt"),
os.path.join("src", "tests", "testdata", "zh", "euc-tw.txt"),
os.path.join("tests", "testdata", "vi", "viscii.txt"),
os.path.join("tests", "testdata", "zh", "euc-tw.txt"),
]

SKIP_LIST_02.extend(SKIP_LIST)
Expand All @@ -35,7 +35,7 @@ def test_ascii():


@pytest.mark.parametrize(
"testfile", glob.glob(os.path.join("src", "tests", "testdata", "*", "*.txt"))
"testfile", glob.glob(os.path.join("tests", "testdata", "*", "*.txt"))
)
def test_detect(testfile):
if testfile.replace("\\", "/") in SKIP_LIST:
Expand All @@ -57,7 +57,6 @@ def test_detector():
detector = cchardet.UniversalDetector()
with open(
os.path.join(
"src",
"tests",
"samples",
"wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt",
Expand Down Expand Up @@ -89,7 +88,7 @@ def test_github_issue_20():


def test_decode():
testfiles = glob.glob(os.path.join("src", "tests", "testdata", "*", "*.txt"))
testfiles = glob.glob(os.path.join("tests", "testdata", "*", "*.txt"))
for testfile in testfiles:
if testfile.replace("\\", "/") in SKIP_LIST_02:
continue
Expand All @@ -109,6 +108,7 @@ def test_decode():
raise e


@pytest.mark.skipif()
def test_utf8_with_bom():
sample = b"\xEF\xBB\xBF"
detected_encoding = cchardet.detect(sample)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Bob�s Burgers
3 changes: 3 additions & 0 deletions src/tests/samples/iso8859-2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id,name
1,english
2,�
Loading

0 comments on commit 27ea468

Please sign in to comment.