JuliaStrings · chris0e3 · Sep 16, 2021 · Sep 16, 2021 · Sep 17, 2021 · stevengj
diff --git a/Makefile b/Makefile
@@ -26,6 +26,8 @@ MAJOR=2
 MINOR=4
 PATCH=1
 
+export UNICODE_VERSION ?= 14.0.0
+
 OS := $(shell uname)
 ifeq ($(OS),Darwin) # MacOS X
   SHLIB_EXT = dylib
@@ -70,7 +72,7 @@ manifest: MANIFEST.new
 
 # real targets
 
-data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.rb data/charwidths.jl
+data/utf8proc_data.c.new: data_make.py
 	$(MAKE) -C data utf8proc_data.c.new
 
 utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c
@@ -166,7 +168,7 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
 
 test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
-	$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@
+	$(CC) $(UCFLAGS) $(LDFLAGS) '-DUNICODE_VERSION="$(UNICODE_VERSION)"' test/misc.c test/tests.o utf8proc.o -o $@
 
 check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
 	$(MAKE) -C bench

diff --git a/data/Makefile b/data/Makefile
@@ -1,63 +1,57 @@
 # Unicode data generation rules.  Except for the test data files, most
 # users will not use these Makefile rules, which are primarily to re-generate
-# unicode_data.c when we get a new Unicode version or charwidth data; they
-# require ruby and julia to be installed.
+# utf8proc_data.c when we get a new Unicode version or charwidth data.
+# Requires Python 3.7+, curl, and sed to be installed.
 
-# programs
-CURL=curl
-RUBY=ruby
-PERL=perl
-MAKE=make
-JULIA=julia
-CURLFLAGS = --retry 5 --location
+CURL = curl --retry 5 --location
 
 .PHONY: clean
 
 .DELETE_ON_ERROR:
 
-utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
-	$(RUBY) data_generator.rb < UnicodeData.txt > $@
-
-CharWidths.txt: charwidths.jl EastAsianWidth.txt
-	$(JULIA) charwidths.jl > $@
+utf8proc_data.c.new: ../data_make.py UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt \
+					 CompositionExclusions.txt CaseFolding.txt emoji-data.txt EastAsianWidth.txt
+	../data_make.py --format=1 --fix26 --output $@ .
 
 # Unicode data version (must also update utf8proc_unicode_version function)
-UNICODE_VERSION=13.0.0
+UNICODE_VERSION?=14.0.0
+URL_ROOT = $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd
 
 UnicodeData.txt:
-	$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
+	$(CURL) -o $@ $(URL_ROOT)/UnicodeData.txt
 
 EastAsianWidth.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/EastAsianWidth.txt
+	$(CURL) -o $@ $(URL_ROOT)/EastAsianWidth.txt
 
 GraphemeBreakProperty.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakProperty.txt
+	$(CURL) -o $@ $(URL_ROOT)/auxiliary/GraphemeBreakProperty.txt
 
 DerivedCoreProperties.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/DerivedCoreProperties.txt
+	$(CURL) -o $@ $(URL_ROOT)/DerivedCoreProperties.txt
 
 CompositionExclusions.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/CompositionExclusions.txt
+	$(CURL) -o $@ $(URL_ROOT)/CompositionExclusions.txt
 
 CaseFolding.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/CaseFolding.txt
+	$(CURL) -o $@ $(URL_ROOT)/CaseFolding.txt
 
 NormalizationTest.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/NormalizationTest.txt
+	$(CURL) -o $@ $(URL_ROOT)/NormalizationTest.txt
 
 GraphemeBreakTest.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt
+	$(CURL) -o $@ $(URL_ROOT)/auxiliary/GraphemeBreakTest.txt
 
 emoji-data.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt
+	$(CURL) -o $@ $(URL_ROOT)/emoji/emoji-data.txt
 
 Uppercase.txt: DerivedCoreProperties.txt
-	$(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Uppercase.*?# Total code points:/m]' > $@
+	sed -n -E '/^# Derived Property: Uppercase/,/^# Total code points:/p' $< > $@
 
 Lowercase.txt: DerivedCoreProperties.txt
-	$(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Lowercase.*?# Total code points:/m]' > $@
+	sed -n -E '/^# Derived Property: Lowercase/,/^# Total code points:/p' $< > $@
 
 clean:
-	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt
-	rm -f Uppercase.txt Lowercase.txt
+	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt \
+		  CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt \
+		  emoji-data.txt Uppercase.txt Lowercase.txt
 	rm -f utf8proc_data.c.new