unicode: Preliminary commit, work-in-progress

mehmet-biter · Feb 6, 2018 · d4cf5f6 · d4cf5f6
1 parent a4ff12d
commit d4cf5f6
Show file tree

Hide file tree

Showing 30 changed files with 159 additions and 667 deletions.
diff --git a/Abbreviations.h b/Abbreviations.h
@@ -2,8 +2,8 @@
 
 #ifndef GB_ABBREVIATIONS_H
 #define GB_ABBREVIATIONS_H
+#include <inttypes.h>
 
-#include "Unicode.h"
 
 // . is the word with this word id an abbreviation?
 // . word id is just the hash64() of the word

diff --git a/ByteOrderMark.cpp b/ByteOrderMark.cpp
@@ -0,0 +1,24 @@
+#include "ByteOrderMark.h"
+#include <stddef.h>
+
+
+// Detects a Unicode byte order mark at the start of buf (signatures follow ICU).
+// Returns a static charset name, or NULL if buf is NULL/too short or has no BOM.
+const char *ucDetectBOM(const char *buf, int32_t bufsize){
+	if (buf == NULL || bufsize < 2) return NULL;
+	if(buf[0] == '\xFE' && buf[1] == '\xFF') {
+		return  "UTF-16BE";
+	} else if(buf[0] == '\xFF' && buf[1] == '\xFE') {
+		// FF FE 00 00 is UTF-32LE; with fewer than 4 bytes it can only be UTF-16LE
+		if(bufsize >= 4 && buf[2] == '\x00' && buf[3] =='\x00') {
+			return "UTF-32LE";
+		}
+		return  "UTF-16LE";
+	} else if(bufsize >= 3 && buf[0] == '\xEF' && buf[1] == '\xBB' && buf[2] == '\xBF') {
+		return  "UTF-8";
+	} else if(bufsize >= 4 && buf[0] == '\x00' && buf[1] == '\x00' &&
+		  buf[2] == '\xFE' && buf[3]=='\xFF') {
+		return  "UTF-32BE";
+	}
+	return NULL;
+}
diff --git a/ByteOrderMark.h b/ByteOrderMark.h
@@ -0,0 +1,16 @@
+#ifndef UNICODE_BOM_H_
+#define UNICODE_BOM_H_
+#include <inttypes.h>
+
+
+//Try to detect the Byte Order Mark of a Unicode Document
+//Returns one of:
+//	"UTF-16BE"
+//	"UTF-32LE"
+//	"UTF-16LE"
+//	"UTF-8"
+//	"UTF-32BE"
+//	NULL
+const char *ucDetectBOM(const char *buf, int32_t bufsize);
+
+#endif
diff --git a/Entities.cpp b/Entities.cpp
@@ -1,7 +1,6 @@
 #include "gb-include.h"
 
 #include "Entities.h"
-#include "Unicode.h"
 #include "HashTableX.h"
 #include "Process.h"
 #include "GbMutex.h"

diff --git a/Entities.h b/Entities.h
@@ -3,7 +3,6 @@
 
 // Matt Wells, copyright Jul 2001
 
-#include "Unicode.h"
 
 int32_t getEntity_a ( const char *s, int32_t maxLen, uint32_t codepoint[2], int32_t *codepointCount, int32_t *utf8Len );
 

diff --git a/GbEncoding.cpp b/GbEncoding.cpp
@@ -1,7 +1,7 @@
 #include "GbEncoding.h"
 #include "HttpMime.h"
 #include "iana_charset.h"
-#include "Unicode.h"
+#include "ByteOrderMark.h"
 #include "fctypes.h"
 
 #include "Log.h"
@@ -355,4 +355,4 @@ uint16_t GbEncoding::getCharset(HttpMime *mime, const char *url, const char *s,
 
 	// all done
 	return charset;
-}
+}
diff --git a/Json.h b/Json.h
@@ -8,7 +8,6 @@
 #define JT_OBJECT 6
 
 #include "gb-include.h"
-#include "Unicode.h"
 #include "SafeBuf.h"
 
 #define MAXJSONPARENTS 64

diff --git a/Makefile b/Makefile
@@ -51,7 +51,7 @@ OBJS_O2 = \
 	Rdb.o RdbBase.o \
 	Sections.o Spider.o SpiderCache.o SpiderColl.o SpiderLoop.o StopWords.o Summary.o \
 	Title.o \
-	UCPropTable.o UdpServer.o Unicode.o UnicodeProperties.o utf8.o utf8_fast.o utf8_convert.o \
+	UCPropTable.o UdpServer.o \
 	Words.o \
 	Xml.o XmlDoc.o XmlDoc_Indexing.o XmlNode.o \
 
@@ -83,6 +83,8 @@ OBJS_O3 = \
 	GbEncoding.o GbLanguage.o \
 	GbDns.o \
 	InstanceInfoExchange.o \
+	ByteOrderMark.o \
+	utf8.o utf8_fast.o utf8_convert.o \
 
 OBJS = $(OBJS_O0) $(OBJS_O1) $(OBJS_O2) $(OBJS_O3)
 
@@ -282,9 +284,9 @@ all: gb
 
 
 # third party libraries
-LIBFILES = libcld2_full.so libcld3.so libced.so libcares.so slacktee.sh libword_variations.a libsto.a
+LIBFILES = libcld2_full.so libcld3.so libced.so libcares.so slacktee.sh libword_variations.a libsto.a libunicode.a
 LIBS += -Wl,-rpath=. -L. -lcld2_full -lcld3 -lprotobuf -lced -lcares
-LIBS += -lword_variations -lsto
+LIBS += -lword_variations -lsto -lunicode
 
 CLD2_SRC_DIR=third-party/cld2/internal
 libcld2_full.so:
@@ -325,6 +327,12 @@ libsto.a:
 	$(MAKE) -C sto/
 	ln -sf sto/libsto.a libsto.a
 
+# Always delegate to the sub-make in unicode/ (hence .PHONY), then link the archive here.
+.PHONY: libunicode.a
+libunicode.a:
+	$(MAKE) -C unicode/
+	./copy_changed_files.sh ucdata/ unicode/*.dat
+	ln -sf unicode/libunicode.a libunicode.a
+
 wanted_check_api.so: WantedCheckExampleLib.o
 	$(CXX) WantedCheckExampleLib.o -shared -o $@
 WantedCheckExampleLib.o: WantedCheckExampleLib.cpp

diff --git a/PageResults.cpp b/PageResults.cpp
@@ -16,7 +16,6 @@
 #include "Bits.h"
 #include "sort.h"
 #include "CountryCode.h"
-#include "Unicode.h"
 #include "Posdb.h"
 #include "PosdbTable.h"
 #include "PageResults.h"

diff --git a/Parms.cpp b/Parms.cpp
@@ -13,7 +13,6 @@
 #include "Collectiondb.h"
 #include "HttpMime.h"      // atotime()
 #include "SearchInput.h"
-#include "Unicode.h"
 #include "Spider.h" // MAX_SPIDER_PRIORITIES
 #include "SpiderColl.h"
 #include "SpiderLoop.h"

diff --git a/Process.cpp b/Process.cpp
@@ -61,7 +61,6 @@ extern void resetDomains       ( );
 extern void resetEntities      ( );
 extern void resetQuery         ( );
 extern void resetAbbrTable     ( );
-extern void resetUnicode       ( );
 
 // our global instance
 Process g_process;
@@ -73,12 +72,16 @@ static const char * const g_files[] = {
 
 	//"hosts.conf",
 
-	"ucdata/kd_data.dat",
-	"ucdata/kdmap.dat",
-	"ucdata/lowermap.dat",
-	"ucdata/properties.dat",
-	"ucdata/scripts.dat",
-	"ucdata/uppermap.dat",
+	"ucdata/unicode_canonical_decomposition.dat",
+	"ucdata/unicode_general_categories.dat",
+	"ucdata/unicode_is_alphabetic.dat",
+	"ucdata/unicode_is_lowercase.dat",
+	"ucdata/unicode_is_uppercase.dat",
+	"ucdata/unicode_properties.dat",
+	"ucdata/unicode_scripts.dat",
+	"ucdata/unicode_to_lowercase.dat",
+	"ucdata/unicode_to_uppercase.dat",
+	"ucdata/unicode_wordchars.dat",
 
 	"gbstart.sh",
 	"gbconvert.sh",
@@ -994,7 +997,7 @@ void Process::resetAll ( ) {
 	g_speller         .reset();
 	g_spiderCache     .reset();
 	g_jobScheduler    .finalize();
-	ucResetMaps();
+	UnicodeMaps::unload_maps();
 	utf8_convert_finalize();
 	g_profiler        .reset();
 
@@ -1012,15 +1015,15 @@ void Process::resetAll ( ) {
 	s_clusterdbQuickCache.reset();
 	s_hammerCache.reset();
 
-	resetDecompTables();
+	UnicodeMaps::unload_maps();
 	resetPageAddUrl();
 	resetHttpMime();
 	reset_iana_charset();
 	resetDomains();
 	resetEntities();
 	resetQuery();
 	resetAbbrTable();
-	resetUnicode();
+	UnicodeMaps::unload_maps();
 
 	// reset other caches
 	g_dns.reset();

diff --git a/Tagdb.cpp b/Tagdb.cpp
@@ -5,7 +5,6 @@
 #include "Tagdb.h"
 #include "Conf.h"       // for setting rdb from Conf file
 #include "Collectiondb.h"
-#include "Unicode.h"
 #include "JobScheduler.h"
 #include "HttpServer.h"
 #include "HttpRequest.h"

diff --git a/Unicode.cpp b/Unicode.cpp
diff --git a/Unicode.h b/Unicode.h
Original file line number	Diff line number	Diff line change
Expand Up		@@ -3,7 +3,6 @@

		// Matt Wells, copyright Jul 2001

		#include "Unicode.h"

		int32_t getEntity_a ( const char *s, int32_t maxLen, uint32_t codepoint[2], int32_t *codepointCount, int32_t *utf8Len );

Expand Down