Skip to content

Commit

Permalink
Support dynamic TLD list
Browse files Browse the repository at this point in the history
  • Loading branch information
isj-privacore committed Aug 31, 2018
1 parent 3a0c267 commit 62b091b
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 25 deletions.
2 changes: 1 addition & 1 deletion Docid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@ void CompatibleUrl::set(const char *t, int32_t tlen) {
}
// . otherwise, get the tld
// . uses thorough list of tlds in Domains.cpp
else if ( ( m_tld = ::getTLD ( m_host, m_hlen ) ) && m_tld > m_host ) {
else if ( ( m_tld = ::getTLD_static ( m_host, m_hlen ) ) && m_tld > m_host ) {
// set m_domain if we had a tld that's not equal to our host
m_tldLen = strlen ( m_tld );
m_domain = ::getDomain ( m_host , m_hlen , m_tld , &m_dlen );
Expand Down
136 changes: 112 additions & 24 deletions Domains.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "ScopedLock.h"

static bool isTLDForUrl(const char *tld, int32_t tldLen);
static bool isTLDForUrl_static(const char *tld, int32_t tldLen);


char *getDomainOfIp ( char *host , int32_t hostLen , int32_t *dlen ) {
Expand Down Expand Up @@ -97,14 +98,62 @@ const char *getTLD ( const char *host , int32_t hostLen ) {
return tld;
}

const char *getTLD_static(const char *host, int32_t hostLen) {
if(hostLen==0)
return NULL;
// make "s" point to last period in the host
//char *s = host + strlen(host) - 1;
const char *hostEnd = host + hostLen;
const char *s = hostEnd - 1;
while ( s > host && *s !='.' ) s--;
// point to the tld in question
const char *t = s;
if ( *t == '.' ) t++;
// reset our current tld ptr
const char *tld = NULL;
// is t a valid tld? if so, set "tld" to "t".
if ( isTLDForUrl_static ( t , hostEnd - t ) ) tld = t;
// host had no period at most we had just a tld so return NULL
if ( s == host ) return tld;

// back up over last period
s--;
// just because it's in table doesn't mean we can't try going up more
while ( s > host && *s !='.' ) s--;
// point to the tld in question
t = s;
if ( *t == '.' ) t++;
// is t a valid tld? if so, set "tld" to "t".
if ( isTLDForUrl_static ( t , hostEnd - t ) ) tld = t;
// host had no period at most we had just a tld so return NULL
if ( s == host ) return tld;


// . now only 1 tld has 2 period and that is "LKD.CO.IM"
// . so waste another iteration for that (TODO: speed up?)
// . back up over last period
s--;
// just because it's in table doesn't mean we can't try going up more
while ( s > host && *s !='.' ) s--;
// point to the tld in question
t = s;
if ( *t == '.' ) t++;
// is t a valid tld? if so, set "tld" to "t".
if ( isTLDForUrl_static ( t , hostEnd - t ) ) tld = t;
// we must have gotten the tld by this point, if there was a valid one
return tld;

}


static HashTableX s_table;
static GbMutex s_tableMutex;
static HashTableX s_staticTLDTable;
static GbMutex s_staticTLDTableMutex;
static HashTableX s_dynamicTLDTable;
static GbMutex s_dynamicTLDTableMutex;

//Docids depend partially on domains (bit 6-13, see Titledb::getProbableDocId() and Url::getDomain()), so we cannot use a dynamic TLD list that ends up
//getting used by Url::getDomain(). So reverted back to hardcoded list, sadly.
//#include "tlds.inc"
static const char * const s_tlds[] = {
//We have both a dynamic / semi-dynamic list of TLDs and a hardcoded one. The hardcoded one must be used Docid::getProbableDocid() so docids are stable. All other code hsould use the dynamic list
#include "tlds.inc"
static const char * const s_staticTLDs[] = {
"AAA",
"AARP",
"ABB",
Expand Down Expand Up @@ -1874,7 +1923,7 @@ static bool loadTLDs(const char *data_dir) {
size_t dlen = strlen(line);
int64_t dh = hash64Lower_a(line, dlen);
//todo/future: encode from utf8 to punycode
if(!s_table.addKey (&dh,NULL)) {
if(!s_dynamicTLDTable.addKey (&dh,NULL)) {
log(LOG_WARN, "build: dom table failed");
fclose(fp);
return false;
Expand All @@ -1886,25 +1935,42 @@ static bool loadTLDs(const char *data_dir) {
return true;
}

static bool initializeTLDTable(const char *data_dir) {
s_table.reset();
if(!s_table.set(8, 0, sizeof(s_tlds)*2,NULL,0,false, "tldtbl")) {
static bool initializeTLDTables(const char *data_dir) {
s_staticTLDTable.reset();
if(!s_staticTLDTable.set(8, 0, sizeof(s_tlds)*2,NULL,0,false, "tldtbl")) {
log(LOG_WARN, "build: Could not init table of TLDs.");
return false;
}
s_dynamicTLDTable.reset();
if(!s_dynamicTLDTable.set(8, 0, sizeof(s_tlds)*2,NULL,0,false, "tldtbl")) {
log(LOG_WARN, "build: Could not init table of TLDs.");
return false;
}

//if(!loadTLDs(data_dir)) {
//use burned-in default
for(int32_t i = 0; s_tlds[i]; i++) {
const char *d = s_tlds[i];
//use hardcoded list for static TLD list
for(int32_t i = 0; s_staticTLDs[i]; i++) {
const char *d = s_staticTLDs[i];
int32_t dlen = strlen (d);
int64_t dh = hash64Lower_a(d, dlen);
if(!s_table.addKey (&dh,NULL)) {
if(!s_staticTLDTable.addKey (&dh,NULL)) {
log( LOG_WARN, "build: dom table failed");
return false;
}
}
//}

//use dynamic or compile-time list for dynamic TLD list
if(!loadTLDs(data_dir)) {
//use burned-in default
for(int32_t i = 0; s_tlds[i]; i++) {
const char *d = s_tlds[i];
int32_t dlen = strlen (d);
int64_t dh = hash64Lower_a(d, dlen);
if(!s_dynamicTLDTable.addKey (&dh,NULL)) {
log( LOG_WARN, "build: dom table failed");
return false;
}
}
}
return true;
}

Expand All @@ -1921,32 +1987,54 @@ static bool isTLDForUrl(const char *tld, int32_t tldLen) {

// otherwise, if one period, check table to see if qualified

if( ! s_table.getNumSlots() ) {
if( ! s_dynamicTLDTable.getNumSlots() ) {
log(LOG_ERROR,"%s:%d: Attempted to use uninitialized TLD table", __func__, __LINE__);
gbshutdownLogicError();
}

int64_t h = hash64Lower_a ( tld , tldLen );
bool b = s_dynamicTLDTable.isInTable ( &h );
return b;
}

static bool isTLDForUrl_static(const char *tld, int32_t tldLen) {
int32_t pcount = 0;
for ( int32_t i = 0 ; i < tldLen ; i++ ) {
if ( tld[i] == '.' ) { pcount++; continue; }
if ( ! is_alnum_a(tld[i]) && tld[i] != '-' ) return false;
}

if ( pcount == 0 ) return true;
if ( pcount >= 2 ) return false;

// otherwise, if one period, check table to see if qualified

if( ! s_staticTLDTable.getNumSlots() ) {
log(LOG_ERROR,"%s:%d: Attempted to use uninitialized TLD table", __func__, __LINE__);
gbshutdownLogicError();
}

int64_t h = hash64Lower_a ( tld , tldLen ); // strlen(tld));
//return s_table.isInTable ( &h );//getScoreFromTermId ( h );
bool b = s_table.isInTable ( &h );//getScoreFromTermId ( h );
int64_t h = hash64Lower_a ( tld , tldLen );
bool b = s_staticTLDTable.isInTable ( &h );
return b;
}


bool isTLD(const char *tld, int32_t tldLen) {
if( ! s_table.getNumSlots() ) {
if( ! s_dynamicTLDTable.getNumSlots() ) {
log(LOG_ERROR,"%s:%d: Attempted to use uninitialized TLD table", __func__, __LINE__);
gbshutdownLogicError();
}
int64_t h = hash64Lower_a(tld, tldLen);
return s_table.isInTable(&h);
return s_dynamicTLDTable.isInTable(&h);
}


bool initializeDomains(const char *data_dir) {
return initializeTLDTable(data_dir);
return initializeTLDTables(data_dir);
}

void finalizeDomains() {
s_table.reset();
s_staticTLDTable.reset();
s_dynamicTLDTable.reset();
}
2 changes: 2 additions & 0 deletions Domains.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ const char *getDomainOfIp(const char *host, int32_t hostLen, int32_t *dlen);

// used by getDomain() above
const char *getTLD ( const char *host , int32_t hostLen ) ;
//eternally stable output by using non-dynamic list of TLDs
const char *getTLD_static(const char *host, int32_t hostLen);

//is the string (com or co.uk) a known TLD?
bool isTLD(const char *tld, int32_t tldLen);
Expand Down

0 comments on commit 62b091b

Please sign in to comment.