Skip to content

Commit

Permalink
unicode: Preliminary commit, work-in-progress
Browse files Browse the repository at this point in the history
  • Loading branch information
isj-privacore committed Feb 6, 2018
1 parent a4ff12d commit d4cf5f6
Show file tree
Hide file tree
Showing 30 changed files with 159 additions and 667 deletions.
2 changes: 1 addition & 1 deletion Abbreviations.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

#ifndef GB_ABBREVIATIONS_H
#define GB_ABBREVIATIONS_H
#include <inttypes.h>

#include "Unicode.h"

// . is the word with this word id an abbreviation?
// . word id is just the hash64() of the word
Expand Down
24 changes: 24 additions & 0 deletions ByteOrderMark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#include "ByteOrderMark.h"
#include <stddef.h>


// Detect a Unicode Byte Order Mark (BOM) at the very start of a buffer.
//
// buf     - pointer to the first bytes of the document (may be NULL)
// bufsize - number of readable bytes in buf
//
// Returns a pointer to a static string naming the encoding implied by the
// BOM ("UTF-16BE", "UTF-16LE", "UTF-32LE", "UTF-8" or "UTF-32BE"), or NULL
// if buf is NULL or does not start with a recognized BOM. The returned
// string is a literal and must not be freed or modified.
//
// Each signature is only tested when the buffer is long enough to hold it,
// so a short buffer consisting of just a 2-byte UTF-16 BOM or a 3-byte
// UTF-8 BOM is still recognized.
const char *ucDetectBOM(const char *buf, int32_t bufsize){
	// the shortest BOM is two bytes long
	if (buf == NULL || bufsize < 2) return NULL;

	// signature bytes are the same ones ICU checks
	if(buf[0] == '\xFE' && buf[1] == '\xFF') {
		return "UTF-16BE";
	}

	if(buf[0] == '\xFF' && buf[1] == '\xFE') {
		// FF FE 00 00 is the UTF-32LE BOM; FF FE followed by anything
		// else (or by too few bytes to tell) is treated as UTF-16LE
		if(bufsize >= 4 && buf[2] == '\x00' && buf[3] == '\x00') {
			return "UTF-32LE";
		}
		return "UTF-16LE";
	}

	if(bufsize >= 3 &&
	   buf[0] == '\xEF' && buf[1] == '\xBB' && buf[2] == '\xBF') {
		return "UTF-8";
	}

	if(bufsize >= 4 &&
	   buf[0] == '\x00' && buf[1] == '\x00' &&
	   buf[2] == '\xFE' && buf[3] == '\xFF') {
		return "UTF-32BE";
	}

	// no recognizable BOM
	return NULL;
}
16 changes: 16 additions & 0 deletions ByteOrderMark.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef UNICODE_BOM_H_
#define UNICODE_BOM_H_
#include <inttypes.h>


// Try to detect the Byte Order Mark (BOM) at the start of a Unicode document.
//
// buf     - the leading bytes of the document
// bufsize - number of bytes available in buf
//
// Returns the name of the encoding indicated by the BOM, one of:
//   "UTF-16BE"
//   "UTF-32LE"
//   "UTF-16LE"
//   "UTF-8"
//   "UTF-32BE"
// or NULL if no BOM was recognized.
// The returned pointer refers to a constant string; do not free or modify it.
const char *ucDetectBOM(const char *buf, int32_t bufsize);

#endif
1 change: 0 additions & 1 deletion Entities.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include "gb-include.h"

#include "Entities.h"
#include "Unicode.h"
#include "HashTableX.h"
#include "Process.h"
#include "GbMutex.h"
Expand Down
1 change: 0 additions & 1 deletion Entities.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

// Matt Wells, copyright Jul 2001

#include "Unicode.h"

int32_t getEntity_a ( const char *s, int32_t maxLen, uint32_t codepoint[2], int32_t *codepointCount, int32_t *utf8Len );

Expand Down
4 changes: 2 additions & 2 deletions GbEncoding.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "GbEncoding.h"
#include "HttpMime.h"
#include "iana_charset.h"
#include "Unicode.h"
#include "ByteOrderMark.h"
#include "fctypes.h"

#include "Log.h"
Expand Down Expand Up @@ -355,4 +355,4 @@ uint16_t GbEncoding::getCharset(HttpMime *mime, const char *url, const char *s,

// all done
return charset;
}
}
1 change: 0 additions & 1 deletion Json.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#define JT_OBJECT 6

#include "gb-include.h"
#include "Unicode.h"
#include "SafeBuf.h"

#define MAXJSONPARENTS 64
Expand Down
14 changes: 11 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ OBJS_O2 = \
Rdb.o RdbBase.o \
Sections.o Spider.o SpiderCache.o SpiderColl.o SpiderLoop.o StopWords.o Summary.o \
Title.o \
UCPropTable.o UdpServer.o Unicode.o UnicodeProperties.o utf8.o utf8_fast.o utf8_convert.o \
UCPropTable.o UdpServer.o \
Words.o \
Xml.o XmlDoc.o XmlDoc_Indexing.o XmlNode.o \

Expand Down Expand Up @@ -83,6 +83,8 @@ OBJS_O3 = \
GbEncoding.o GbLanguage.o \
GbDns.o \
InstanceInfoExchange.o \
ByteOrderMark.o \
utf8.o utf8_fast.o utf8_convert.o \

OBJS = $(OBJS_O0) $(OBJS_O1) $(OBJS_O2) $(OBJS_O3)

Expand Down Expand Up @@ -282,9 +284,9 @@ all: gb


# third party libraries
LIBFILES = libcld2_full.so libcld3.so libced.so libcares.so slacktee.sh libword_variations.a libsto.a
LIBFILES = libcld2_full.so libcld3.so libced.so libcares.so slacktee.sh libword_variations.a libsto.a libunicode.a
LIBS += -Wl,-rpath=. -L. -lcld2_full -lcld3 -lprotobuf -lced -lcares
LIBS += -lword_variations -lsto
LIBS += -lword_variations -lsto -lunicode

CLD2_SRC_DIR=third-party/cld2/internal
libcld2_full.so:
Expand Down Expand Up @@ -325,6 +327,12 @@ libsto.a:
$(MAKE) -C sto/
ln -sf sto/libsto.a libsto.a

# libunicode.a is built by the sub-make in unicode/. Declare it phony (note
# the leading dot: a plain "PHONY:" would just define an ordinary target
# named PHONY) so the sub-make is always consulted and can decide for itself
# whether anything needs rebuilding, even when the symlink already exists.
.PHONY: libunicode.a
libunicode.a:
	$(MAKE) -C unicode/
	./copy_changed_files.sh ucdata/ unicode/*.dat
	ln -sf unicode/libunicode.a libunicode.a

wanted_check_api.so: WantedCheckExampleLib.o
$(CXX) WantedCheckExampleLib.o -shared -o $@
WantedCheckExampleLib.o: WantedCheckExampleLib.cpp
Expand Down
1 change: 0 additions & 1 deletion PageResults.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#include "Bits.h"
#include "sort.h"
#include "CountryCode.h"
#include "Unicode.h"
#include "Posdb.h"
#include "PosdbTable.h"
#include "PageResults.h"
Expand Down
1 change: 0 additions & 1 deletion Parms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#include "Collectiondb.h"
#include "HttpMime.h" // atotime()
#include "SearchInput.h"
#include "Unicode.h"
#include "Spider.h" // MAX_SPIDER_PRIORITIES
#include "SpiderColl.h"
#include "SpiderLoop.h"
Expand Down
23 changes: 13 additions & 10 deletions Process.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ extern void resetDomains ( );
extern void resetEntities ( );
extern void resetQuery ( );
extern void resetAbbrTable ( );
extern void resetUnicode ( );

// our global instance
Process g_process;
Expand All @@ -73,12 +72,16 @@ static const char * const g_files[] = {

//"hosts.conf",

"ucdata/kd_data.dat",
"ucdata/kdmap.dat",
"ucdata/lowermap.dat",
"ucdata/properties.dat",
"ucdata/scripts.dat",
"ucdata/uppermap.dat",
"ucdata/unicode_canonical_decomposition.dat",
"ucdata/unicode_general_categories.dat",
"ucdata/unicode_is_alphabetic.dat",
"ucdata/unicode_is_lowercase.dat",
"ucdata/unicode_is_uppercase.dat",
"ucdata/unicode_properties.dat",
"ucdata/unicode_scripts.dat",
"ucdata/unicode_to_lowercase.dat",
"ucdata/unicode_to_uppercase.dat",
"ucdata/unicode_wordchars.dat",

"gbstart.sh",
"gbconvert.sh",
Expand Down Expand Up @@ -994,7 +997,7 @@ void Process::resetAll ( ) {
g_speller .reset();
g_spiderCache .reset();
g_jobScheduler .finalize();
ucResetMaps();
UnicodeMaps::unload_maps();
utf8_convert_finalize();
g_profiler .reset();

Expand All @@ -1012,15 +1015,15 @@ void Process::resetAll ( ) {
s_clusterdbQuickCache.reset();
s_hammerCache.reset();

resetDecompTables();
UnicodeMaps::unload_maps();
resetPageAddUrl();
resetHttpMime();
reset_iana_charset();
resetDomains();
resetEntities();
resetQuery();
resetAbbrTable();
resetUnicode();
UnicodeMaps::unload_maps();

// reset other caches
g_dns.reset();
Expand Down
1 change: 0 additions & 1 deletion Tagdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#include "Tagdb.h"
#include "Conf.h" // for setting rdb from Conf file
#include "Collectiondb.h"
#include "Unicode.h"
#include "JobScheduler.h"
#include "HttpServer.h"
#include "HttpRequest.h"
Expand Down
105 changes: 0 additions & 105 deletions Unicode.cpp

This file was deleted.

28 changes: 0 additions & 28 deletions Unicode.h

This file was deleted.

Loading

0 comments on commit d4cf5f6

Please sign in to comment.