-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDocid.cpp
697 lines (593 loc) · 19 KB
/
Docid.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
#include "Docid.h"
#include "Url.h"
#include "hash.h"
#include "Titledb.h" //DOCID_MASK
#include "Punycode.h"
#include "UrlParser.h"
#include "gbmemcpy.h"
#include "Domains.h"
#include "ip.h" // atoip ( s,len)
#include <algorithm>
#include <stdio.h>
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif
namespace {
//trimmed-down version of Url which calls the static getTLD()
//Only the fields needed for docid computation are kept; set() fills them
//by parsing/normalizing a raw url string (see CompatibleUrl::set below).
struct CompatibleUrl {
	char m_url[MAX_URL_LEN];  // normalized, NUL-terminated url
	int32_t m_ulen;           // length of m_url
	char *m_host;             // points into m_url at the hostname
	int32_t m_hlen;           // hostname length
	const char *m_domain;     // points into m_url at the domain (NULL if none)
	int32_t m_dlen;           // domain length
	const char *m_tld;        // points into m_url at the TLD (NULL for IP hosts)
	int32_t m_tldLen;         // TLD length
	// Parse/normalize the raw url <t,tlen> and fill the members above.
	void set(const char *t, int32_t tlen);
};
} //anonymous namespace
// Hash the canonicalized url into a docid, then stamp an 8-bit domain hash
// into bits 6-13 so that docids from the same domain cluster together.
// Resulting bit layout (low 32 bits shown):
//   dddddddd dddddddd ddhhhhhh hhdddddd
static uint64_t getProbableDocId(const char *url, const char *dom, int32_t domLen) {
	// full hash of the NUL-terminated url, truncated to the docid width
	uint64_t docId = hash64b(url, 0) & DOCID_MASK;
	// zero out bits 6-13; the domain hash goes there
	docId &= ~0x0000000000003fc0ULL;
	// fold the 8-bit domain hash into bits 6-13
	uint32_t domHash = hash8(dom, domLen);
	docId |= (domHash << 6);
	return docId;
}
// Convenience overload: compute the probable docid from an already-parsed
// CompatibleUrl by unpacking the fields the hash needs.
static uint64_t getProbableDocId(const CompatibleUrl *url) {
	const char *dom = url->m_domain;
	const int32_t dlen = url->m_dlen;
	return ::getProbableDocId(url->m_url, dom, dlen);
}
// Compute the probable docid for an already-parsed Url.
// The url is re-normalized through CompatibleUrl::set() so the hash is
// computed over the same canonical form used historically, independent of
// any future changes to the Url class.
uint64_t Docid::getProbableDocId(const Url *url) {
	CompatibleUrl u;
	u.set(url->getUrl(), url->getUrlLen());
	// delegate to the CompatibleUrl overload, consistent with
	// Docid::getProbableDocId(const char *)
	return ::getProbableDocId(&u);
}
uint64_t Docid::getProbableDocId(const char *url) {
CompatibleUrl u;
u.set(url,strlen(url));
return ::getProbableDocId(&u);
}
// First docid of the 64-docid band containing d (low 6 bits cleared).
uint64_t Docid::getFirstProbableDocId(int64_t d) {
	return static_cast<uint64_t>(d) & ~0x3fULL;
}
// Last docid of the 64-docid band containing d (low 6 bits set).
uint64_t Docid::getLastProbableDocId(int64_t d) {
	return static_cast<uint64_t>(d) | 0x3fULL;
}
// Extract the 8-bit domain hash that getProbableDocId() stored in
// bits 6-13 of the docid.
uint8_t Docid::getDomHash8FromDocId(int64_t d) {
	return static_cast<uint8_t>((d >> 6) & 0xff);
}
//Copied from Url::set().
//Lots of questionable code. Not much we can do.
//The purpose is to keep this code calling the static-list getTLD() so we
//have freedom to modify the normal Url class without changing the docids
//computed here.
//
//Parses and normalizes the raw url <t,tlen> into m_url and points m_host,
//m_domain and m_tld into it. On empty/unusable input the members stay
//zeroed. If the url contains non-ascii bytes, hostname labels are punycoded,
//the rest is percent-encoded, and set() is re-invoked on the ascii result.
void CompatibleUrl::set(const char *t, int32_t tlen) {
	// behavior flags frozen for docid compatibility: never add "www.",
	// never strip cgi parameters or common filenames, and always apply
	// the titledb version 129 normalization rules
	bool addWWW = false;
	bool stripParams=false;
	bool stripCommonFile=false;
	int32_t titledbVersion = 129;
#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(t,tlen);
#endif
	// locals named m_* shadow members of the full Url class that
	// CompatibleUrl does not keep; they exist so the code below stays
	// textually close to Url::set()
	char *m_scheme = NULL;
	m_host = NULL;
	char *m_path = NULL;
	char *m_filename = NULL;
	char *m_extension = NULL;
	char *m_query = NULL;
	m_domain = NULL;
	m_tld = NULL;
	m_url[0] = '\0';
	m_ulen = 0;
	m_dlen = 0;
	int32_t m_slen = 0;
	int32_t m_qlen = 0;
	m_hlen = 0;
	int32_t m_elen = 0;
	int32_t m_mdlen = 0;
	// Coverity
	int32_t m_plen = 0;
	int32_t m_flen = 0;
	m_tldLen = 0;
	int32_t m_port = 0;
	int32_t m_defPort = 0;
	int32_t m_portLen = 0;
	char *m_portPtr = nullptr;
	int32_t m_portPtrLen = 0;
	if (!t || tlen == 0) {
		return;
	}
	// we may add a "www.", a trailing backslash and a \0, so leave room
	if (tlen > MAX_URL_LEN - 10) {
		log( LOG_LIMIT, "db: Encountered url of length %" PRId32 ". Truncating to %i", tlen, MAX_URL_LEN - 10 );
		tlen = MAX_URL_LEN - 10;
	}
	char stripped[MAX_URL_LEN];
	if (titledbVersion >= 125) {
		// skip starting spaces
		while (tlen > 0 && is_wspace_a(*t)) {
			++t;
			--tlen;
		}
		// remove embedded tab/cr/lf bytes entirely (not just at the ends)
		std::string url(t, tlen);
		url.erase(std::remove_if(url.begin(), url.end(), [](char c) { return c == 0x09 || c == 0x0A || c == 0x0D; }), url.end());
		memcpy(stripped, url.c_str(), url.size());
		stripped[url.size()] = '\0';
		t = stripped;
		tlen = url.size();
		// skip ending spaces
		while (tlen > 0 && is_wspace_a(t[tlen - 1])) {
			--tlen;
		}
	}
	// . skip over non-alnum chars (except - or /) in the beginning
	// . if url begins with // then it's just missing the http: (slashdot)
	// . watch out for hostname like: -dark-.deviantart.com (yes, it's real)
	// . so all protocols and hostnames MUST start with alnum OR hyphen
	while (tlen > 0 && !is_alnum_a(*t) && *t != '-' && *t != '/') {
		t++;
		tlen--;
	}
	// . stop t at first space or binary char
	// . url should be in encoded form!
	int32_t i;
	int32_t nonAsciiPos = -1;
	for ( i = 0 ; i < tlen ; i++ ) {
		if (titledbVersion < 125 && is_wspace_a(t[i])) {
			break;
		}
		if (!is_ascii(t[i])) {
			// Sometimes the length with the null is passed in,
			// so ignore nulls FIXME?
			if (t[i]) {
				nonAsciiPos = i;
			}
			break; // no non-ascii chars allowed
		}
	}
	if ( nonAsciiPos != -1 ) {
		// Try turning utf8 and latin1 encodings into punycode.
		// All labels (between dots) in the domain are encoded
		// separately. We don't support encoded tlds, but they are
		// not widespread yet.
		// If it is a non ascii domain it needs to take the form
		// xn--<punycoded label>.xn--<punycoded label>.../
		log(LOG_DEBUG, "build: attempting to decode unicode url %*.*s pos at %" PRId32, (int)tlen, (int)tlen, t, nonAsciiPos);
		char encoded [ MAX_URL_LEN ];
		size_t encodedLen = MAX_URL_LEN;
		char *encodedDomStart = encoded;
		const char *p = t;
		const char *pend = t+tlen;
		// Find the start of the domain
		if ( tlen > 7 && strncmp( p, "http://", 7 ) == 0 ) {
			p += 7;
		} else if ( tlen > 8 && strncmp( p, "https://", 8 ) == 0 ) {
			p += 8;
		}
		gbmemcpy(encodedDomStart, t, p-t);
		encodedDomStart += p-t;
		// encode the hostname label by label, stopping at the path
		// ('/') or, for version >= 125, the port separator (':')
		while (p < pend && *p != '/' && *p != ':') {
			const char *labelStart = p;
			uint32_t tmpBuf[MAX_URL_LEN];
			int32_t tmpLen = 0;
			while (p < pend && *p != '.' && *p != '/' &&
			       (titledbVersion < 125 || (titledbVersion >= 125 && *p != ':'))) {
				p++;
			}
			int32_t labelLen = p - labelStart;
			bool tryLatin1 = false;
			// For utf8 urls
			p = labelStart;
			bool labelIsAscii = true;
			// Convert the label to code points and copy it to tmpBuf to
			// be punycoded; note the loop increment reads the code point
			// decoded by the body in the same iteration
			for ( ; p - labelStart < labelLen; p += utf8Size( tmpBuf[tmpLen] ), tmpLen++ ) {
				labelIsAscii = labelIsAscii && is_ascii( *p );
				tmpBuf[tmpLen] = utf8Decode( p );
				if ( !tmpBuf[tmpLen] ) { // invalid char?
					tryLatin1 = true;
					break;
				}
			}
			if ( labelIsAscii ) {
				// pure-ascii label: copy it through unchanged,
				// including the trailing dot if there is one
				if ( labelStart[labelLen] == '.' ) {
					labelLen++;
					p++;
				}
				gbmemcpy( encodedDomStart, labelStart, labelLen );
				encodedDomStart += labelLen;
				continue;
			}
			if ( tryLatin1 ) {
				// For latin1 urls: treat each byte as one code point
				tmpLen = 0;
				for ( ; tmpLen < labelLen; tmpLen++ ) {
					tmpBuf[tmpLen] = labelStart[tmpLen];
				}
			}
			gbmemcpy( encodedDomStart, "xn--", 4 );
			encodedDomStart += 4;
			encodedLen = MAX_URL_LEN - (encodedDomStart - encoded);
			punycode_status status = punycode_encode( tmpLen, tmpBuf, NULL, &encodedLen, encodedDomStart );
			if ( status != 0 ) {
				// Give up? try again?
				log("build: Bad Engineer, failed to "
				    "punycode international url %s (%" PRId32 ")",
				    t, (int32_t)status);
				return;
			}
			// We should check if what we encoded were valid url characters, no spaces, etc
			// FIXME: should we exclude just the bad chars? I've seen plenty of urls with
			// a newline in the middle. Just discard the whole chunk for now
			bool badUrlChars = false;
			for ( uint32_t i = 0; i < encodedLen; i++ ) {
				if ( is_wspace_a( encodedDomStart[i] ) ) {
					badUrlChars = true;
					break;
				}
			}
			if ( encodedLen == 0 || badUrlChars ) {
				encodedDomStart -= 4; // don't need the xn--
				p++;
			} else {
				encodedDomStart += encodedLen;
				*encodedDomStart++ = *p++; // Copy in the . or the /
			}
		}
		// p now points to the end of the domain
		// encodedDomStart now points to the first free space in encoded string
		// Now copy the rest of the url in. Watch out for non-ascii chars
		// truncate the url, and keep it under max url length
		uint32_t newUrlLen = encodedDomStart - encoded;
		while (p < pend) {
			if ( ! *p ) {
				break; // null?
			}
			if (!is_ascii(*p)) {
				// url encode utf8 characters now
				char cs = getUtf8CharSize(p);
				// bad utf8 char?
				if ( !isValidUtf8Char(p) ) {
					break;
				}
				int maxDestLen = (cs * 3) + 1; // %XX + \0
				// too long?
				if ( newUrlLen + maxDestLen >= MAX_URL_LEN ) {
					break;
				}
				char stored = urlEncode(&encoded[newUrlLen], maxDestLen, p, cs);
				p += cs;
				newUrlLen += stored;
				continue;
			}
			if (is_wspace_a(*p)) {
				break;
			}
			if (newUrlLen + 1 >= MAX_URL_LEN) {
				break;
			}
			encoded[newUrlLen++] = *p++;
		}
		encoded[newUrlLen] = '\0';
		// recurse on the now fully-ascii url
		return this->set( encoded, newUrlLen );
	}
	// truncate length to the first occurrence of an unacceptable char
	tlen = i;
	// . jump over http:// if it starts with http://http://
	// . a common mistake...
	while ( tlen > 14 && ! strncasecmp ( t , "http://http://" , 14 ) ) {
		t += 7;
		tlen -= 7;
	}
	// only strip anchor for version <= 122 (anchor is stripped in
	// UrlParser for later versions; this branch is dead here since
	// titledbVersion is hard-coded to 129 above)
	if (titledbVersion <= 122) {
		// strip the "#anchor" from http://www.xyz.com/somepage.html#anchor"
		for (int32_t i = 0; i < tlen; i++) {
			if (t[i] == '#') {
				// ignore anchor if a ! follows it. 'google hash bang hack'
				// which breaks the web and is now deprecated, but, there it is
				if (i + 1 < tlen && t[i + 1] == '!') {
					continue;
				}
				tlen = i;
				break;
			}
		}
	}
	// copy to "s" so we can NULL terminate it
	char s[MAX_URL_LEN];
	int32_t len = tlen;
	if (titledbVersion <= 122) {
		// store filtered url into s
		memcpy(s, t, tlen);
		s[len] = '\0';
		if (stripParams) {
			//stripParametersv122(s, &len);
		}
	} else {
		// normalize through UrlParser and rebuild the url into s
		UrlParser urlParser(t, tlen, titledbVersion);
		if (stripParams) {
			//stripParameters(&urlParser);
		}
		// rebuild url
		urlParser.unparse();
		len = urlParser.getUrlParsedLen();
		if (len > MAX_URL_LEN - 10) {
			len = MAX_URL_LEN - 10;
		}
		strncpy(s, urlParser.getUrlParsed(), len);
		s[len] = '\0';
	}
	// remove common filenames like index.html
	// (dead here: stripCommonFile is hard-coded to false above)
	if ( stripCommonFile ) {
		if ( len - 14 > 0 &&
		     strncasecmp(&s[len-14], "/default.xhtml", 14) == 0 )
			len -= 13;
		else if ( len - 13 > 0 &&
		     ( strncasecmp(&s[len-13], "/default.html", 13) == 0 ||
		       strncasecmp(&s[len-13], "/default.ascx", 13) == 0 ||
		       strncasecmp(&s[len-13], "/default.ashx", 13) == 0 ||
		       strncasecmp(&s[len-13], "/default.asmx", 13) == 0 ||
		       strncasecmp(&s[len-13], "/default.xhtm", 13) == 0 ||
		       strncasecmp(&s[len-13], "/default.aspx", 13) == 0 ) )
			len -= 12;
		else if ( len - 12 > 0 &&
		     ( strncasecmp(&s[len-12], "/default.htm", 12) == 0 ||
		       strncasecmp(&s[len-12], "/default.php", 12) == 0 ||
		       strncasecmp(&s[len-12], "/default.asp", 12) == 0 ||
		       strncasecmp(&s[len-12], "/index.xhtml", 12) == 0 ) )
			len -= 11;
		else if ( len - 11 > 0 &&
		     ( strncasecmp(&s[len-11], "/index.html", 11) == 0 ||
		       strncasecmp(&s[len-11], "/index.aspx", 11) == 0 ||
		       strncasecmp(&s[len-11], "/index.xhtm", 11) == 0 ||
		       strncasecmp(&s[len-11], "/default.pl", 11) == 0 ||
		       strncasecmp(&s[len-11], "/default.cs", 11) == 0 ) )
			len -= 10;
		else if ( len - 10 > 0 &&
		     ( strncasecmp(&s[len-10], "/index.htm", 10) == 0 ||
		       strncasecmp(&s[len-10], "/index.php", 10) == 0 ||
		       strncasecmp(&s[len-10], "/index.asp", 10) == 0 ||
		       strncasecmp(&s[len-10], "/main.html", 10) == 0 ||
		       strncasecmp(&s[len-10], "/main.aspx", 10) == 0 ) )
			len -= 9;
		else if ( len - 9 > 0 &&
		     ( strncasecmp(&s[len-9], "/index.pl", 9) == 0 ||
		       strncasecmp(&s[len-9], "/main.htm", 9) == 0 ||
		       strncasecmp(&s[len-9], "/main.php", 9) == 0 ) )
			len -= 8;
		else if ( len - 8 > 0 &&
		     ( strncasecmp(&s[len-8], "/main.pl", 8) == 0 ) )
			len -= 7;
		s[len] = '\0';
	}
	// replace the "\" with "/" -- a common mistake
	int32_t j;
	for ( j = 0 ; s[j] ; j++)
	{
		if (s[j]=='\\')
		{
			s[j]='/';
		}
	}
	// . dig out the protocol/scheme for this s (check for ://)
	// . protocol may only have alnums and hyphens in it
	for ( i = 0 ; s[i] && (is_alnum_a(s[i]) || s[i]=='-') ; i++ );
	// if we have a legal protocol, then set "m_scheme", "slen" and "sch"
	// and advance i to the m_host
	if ( i + 2 < len && s[i]==':' && s[i+1]=='/' && s[i+2]=='/')
	{
		// copy lowercase protocol to "m_url"
		to_lower3_a ( s , i + 3 , m_url );
		m_scheme = m_url;
		m_slen = i;
		m_ulen = i + 3;
		i += 3;
	}
	else
	if (i + 2 < len && s[i]==':' && s[i+1]=='/'&& is_alnum_a(s[i+2]))
	{
		// scheme written as "x:/host" -- copy lowercase protocol to
		// "m_url" and add in the missing second slash
		to_lower3_a ( s , i + 2 , m_url );
		// add in needed /
		m_url[i+2]='/';
		m_scheme = m_url;
		m_slen = i;
		m_ulen = i + 3;
		i += 2;
	}
	else
	{
		// no scheme at all: assume http://
		gbmemcpy ( m_url,"http://" , 7 );
		m_scheme = m_url;
		m_slen = 4;
		m_ulen = 7;
		i = 0;
		// if s started with // then skip that (slashdot)
		if ( s[0]=='/' && s[1]=='/' ) i = 2;
	}
	// . now &s[i] should point to the m_host name
	// . chars allowed in hostname = period,alnum,hyphen,underscore
	// . stops at '/' or ':' or any other disallowed character
	j = i;
	while (s[j] && (is_alnum_a(s[j]) || s[j]=='.' || s[j]=='-'||s[j]=='_'))
		j++;
	// copy the hostname into "m_url" (make it lower case, too)
	to_lower3_a ( s + i, j - i, m_url + m_ulen );
	m_host = m_url + m_ulen;
	m_hlen = j - i;
	// common mistake: if hostname ends in a . then back up
	while ( m_hlen > 0 && m_host[m_hlen-1]=='.' ) m_hlen--;
	// NULL terminate for strchr()
	m_host [ m_hlen ] = '\0';
	// advance m_ulen to end of hostname
	m_ulen += m_hlen;
	// . Test if hostname is in a.b.c.d format
	// . this returns 0 if not a valid ip string
	int32_t ip = atoip ( m_host , m_hlen );
	// advance i to the : for the port, if it exists
	i = j;
	// NULL terminate m_host for getTLD(), getDomain() and strchr() below
	m_host [ m_hlen ] = '\0';
	// use ip as domain if we're just an ip address like 192.0.2.1
	if ( ip ) {
		// ip address has no tld, or mid domain
		m_tld = NULL;
		m_tldLen = 0;
		// but it does have a domain (1.2.3)
		m_domain = getDomainOfIp ( m_host , m_hlen , &m_dlen );
		// just use the domain as the mid domain for ip-based urls
		m_mdlen = m_dlen;
	}
	// . otherwise, get the tld
	// . uses thorough list of tlds in Domains.cpp
	else if ( ( m_tld = ::getTLD_static ( m_host, m_hlen ) ) && m_tld > m_host ) {
		// set m_domain if we had a tld that's not equal to our host
		m_tldLen = strlen ( m_tld );
		m_domain = ::getDomain ( m_host , m_hlen , m_tld , &m_dlen );
		// set the mid domain length (-1 for the '.')
		m_mdlen = m_dlen - m_tldLen - 1;
	}
	// otherwise, we're no ip and we have no valid domain
	else {
		m_domain = NULL;
		m_dlen = 0;
		m_tldLen = 0;
		m_mdlen = 0;
	}
	// . if domain same as host then we might insert a "www." server name
	// . however, must have a period in domain name
	// . otherwise a domain name of "xxx" would become "www.xxx" and if
	//   Url::set() is called on that it would be "www.www.xxx" (bad bad)
	// . let's only add "www." if there's only 1 period, ok?
	// (dead here: addWWW is hard-coded to false above)
	if ( ! ip && addWWW && m_host == m_domain && strchr(m_host,'.') ) {
		memmove ( m_host + 4 , m_host , m_hlen );
		gbmemcpy ( m_host , "www." , 4 );
		if ( m_domain ) m_domain += 4;
		if ( m_tld ) m_tld += 4;
		m_ulen += 4;
		m_hlen += 4;
	}
	// set the default port based on the protocol
	m_defPort = 80;
	if ( m_slen==5 && strncmp(m_scheme, "https",5)==0 ) m_defPort = 443;
	// assume we're using the default port for this scheme/protocol
	m_port = m_defPort;
	// see if a port was provided in the hostname after a colon
	if ( s[i] == ':' ) {
		// remember the ptr so far
		int32_t savedLen = m_ulen;
		// add a colon to our m_url
		m_url [ m_ulen++ ] = ':';
		// scan for a '/', copying the port digits into m_url
		j = i + 1;
		while ( s[j] && s[j]!='/') m_url[m_ulen++] = s[j++];
		m_portPtr = s + i + 1;
		m_portPtrLen = j - (i + 1);
		// now read our port
		m_port = atol2(m_portPtr, m_portPtrLen);
		// if it's the default port, then remove what we copied
		if ( m_port == m_defPort ) m_ulen = savedLen;
		// make i point to the root / in the m_path, if any
		i = j;
	}
	// how many chars is taken up by a specified port? (":NNNNN")
	m_portLen = 0;
	if ( m_port != m_defPort ) {
		m_portLen += 2; // :3
		if ( m_port >= 10 ) m_portLen += 1;
		if ( m_port >= 100 ) m_portLen += 1;
		if ( m_port >= 1000 ) m_portLen += 1;
		if ( m_port >= 10000 ) m_portLen += 1;
	}
	// append a '/' to m_url then bail if there is no m_path after the port
	if ( s[i] != '/') {
		m_path = m_url + m_ulen;
		m_path[0] = '/';
		m_plen = 1;
		m_url[ ++m_ulen ]='\0';
		return;
	}
	// . get the m_path and m_path length
	// . j,i should point to start of path slash '/'
	// . scan so it points to end or a ? or #
	j = i;
	// now we include # as part of the path if it is a hash bang '#!'
	// which was the web-breaking google hack that is now deprecated
	while ( s[j] && s[j]!='?' ) {
		if ( s[j] == '#' && s[j+1] != '!' )
			break;
		j++;
	}
	// point the path inside m_url even though we haven't written it yet
	m_path = m_url + m_ulen;
	// m_plen temporarily holds the path START offset; converted to the
	// actual path length after the copy loop below
	m_plen = m_ulen;
	// . deal with weird things in the path
	// . i points to start of path (should be /)
	for (; i < j ; i++ ) {
		// dedup double slashes
		// ensure m_ulen >= m_plen so we don't hurt "http:///" ...
		// but people sometimes put http:// in the *path*
		if ( s[i] == '/' && m_url[m_ulen-1] == '/' &&
		     m_ulen-1 >= m_plen &&
		     m_ulen >= 2 && m_url[m_ulen-2] != ':' ) continue;
		// handled by UrlParser for version 123 and above
		if (titledbVersion <= 122) {
			// deal with current directories in the m_path
			if ( s[i] == '.' && m_url[m_ulen-1] == '/' &&
			     (i+1 == j || s[i+1]=='/')) continue;
			// . deal with damned ..'s in the m_path
			// . if next 2 chars are .'s and last char we wrote was '/'
			if ( s[i] == '.' && s[i+1]=='.' && (s[i+2] == '/' || s[i+2] == '\0') && m_url[m_ulen-1] == '/' ) {
				// dont back up over first / in path
				if ( m_url + m_ulen - 1 > m_path ) m_ulen--;
				while ( m_url[m_ulen-1] != '/' ) m_ulen--;
				// skip i to next / after these 2 dots
				while ( s[i] && s[i]!='/' ) i++;
				continue;
			}
		}
		// don't allow ; before the ?...probably because of stripped
		// sessionId...
		// I was going to add other possible dup separators, but now
		// it seems as though it might cause problems
		if (s[i] == ';' && s[i+1] == '?') continue;
		// store char and advance to next
		m_url[m_ulen++] = s[i];
	}
	// reset the path length in case we had to remove some weird stuff
	m_plen = m_ulen - m_plen;
	// . get the m_query
	// . the query is anything after the path that starts with ?
	// . NOTE: we ignore strings beginning with '#' (page relative anchors)
	if ( i < len && s[i] != '#' ) {
		//remove back to back &'s in the cgi query
		//http://www.nyasatimes.com/national/politics/160.html?print&&&
		char *kstart = s + i;
		char *kend = s + i + (len - i);
		char *dst = m_url + m_ulen;
		for ( char *k = kstart ; k < kend ; k++ ) {
			// skip & if we just did one
			if ( *k == '&' && k > kstart && *(k-1)=='&' ) continue;
			// copy over one char at a time
			*dst++ = *k;
		}
		// point after the '?' i guess
		m_query = m_url + m_ulen + 1;
		m_qlen = dst - m_query;
		m_ulen += m_qlen + 1;
	}
	// get the m_filename from the m_path (m_flen might be 0)
	m_flen = 0;
	while (m_path[m_plen-1-m_flen]!='/' && m_flen<m_plen) m_flen++;
	m_filename = m_path + m_plen - m_flen;
	// get the m_extension from the m_path (alnum run after a final '.')
	m_elen = 0;
	while (is_alnum_a(m_path[m_plen-1-m_elen]) && m_elen < m_plen)m_elen++;
	if ( m_path[ m_plen-1-m_elen] != '.' ) m_elen = 0; // no m_extension
	m_extension = m_path + m_plen - m_elen;
	// null terminate our m_url
	m_url[ m_ulen ]='\0';
}