1
- use std:: fmt:: format;
2
- use std:: hint:: black_box;
3
- use std:: io:: { BufRead , Read , Write } ;
4
- use std:: { mem, process, ptr} ;
1
+ use std:: io:: { Read , Write } ;
5
2
use std:: time:: Instant ;
6
3
use mork_bytestring:: { byte_item, Expr , ExprZipper , ExtractFailure , item_byte, Tag } ;
7
4
use mork_bytestring:: Tag :: { Arity , SymbolSize } ;
8
- use mork_frontend:: bytestring_parser:: { Parser , ParserError , BufferedIterator } ;
5
+ use mork_frontend:: { bytestring_parser:: { Parser , ParserError , /* BufferedIterator */ } , cz3_parser :: BufferedIterator } ;
9
6
use pathmap:: trie_map:: BytesTrieMap ;
10
7
use pathmap:: zipper:: { ReadZipper , WriteZipper , Zipper } ;
11
8
12
9
10
+ pub ( crate ) mod symbol_mapping;
11
+ pub use symbol_mapping:: SymbolMapping ;
12
+
13
13
#[ repr( transparent) ]
14
14
pub struct Space { pub ( crate ) btm : BytesTrieMap < ( ) > }
15
15
@@ -275,64 +275,6 @@ fn indiscriminate_bidirectional_matching_stack(ez: &mut ExprZipper) -> Vec<u8> {
275
275
}
276
276
}
277
277
278
- pub struct SymbolMapping {
279
- count : u64 ,
280
- symbols : BytesTrieMap < Vec < u8 > > ,
281
- strings : BytesTrieMap < String > ,
282
- }
283
-
284
- impl SymbolMapping {
285
- pub fn new ( ) -> Self {
286
- Self {
287
- count : 3 ,
288
- symbols : BytesTrieMap :: new ( ) ,
289
- strings : BytesTrieMap :: new ( ) ,
290
- }
291
- }
292
-
293
- // temporary workaround for the inability of making BytesTrieMaps static
294
- pub fn as_static_mut ( & mut self ) -> & ' static mut SymbolMapping {
295
- unsafe { mem:: transmute :: < & mut SymbolMapping , & ' static mut SymbolMapping > ( self ) }
296
- }
297
-
298
- pub fn as_static ( & self ) -> & ' static SymbolMapping {
299
- unsafe { mem:: transmute :: < & SymbolMapping , & ' static SymbolMapping > ( & self ) }
300
- }
301
- }
302
-
303
/// Encodes `i` as its shortest big-endian byte string, using `buffer` as
/// backing storage.
///
/// All eight big-endian bytes of `i` are written to `buffer`. The returned
/// slice is the tail of that buffer starting at the most significant non-zero
/// byte; for `i == 0` it is the single final `0` byte, so the result is never
/// empty.
///
/// The length is computed from the value itself (`leading_zeros`), so the
/// produced key is the same on little- and big-endian hosts.
///
/// `buffer` must be valid for writes of 8 bytes. The lifetime `'a` is not
/// tied to anything, so the caller must keep the buffer alive and unmodified
/// for as long as the returned slice is in use.
fn gen_key<'a>(i: u64, buffer: *mut u8) -> &'a [u8] {
    let be = i.to_be_bytes();
    // Number of bytes needed for `i`, but at least one so zero still has a key.
    let significant = 8 - (i.leading_zeros() / 8) as usize;
    let len = significant.max(1);
    // SAFETY: the caller guarantees `buffer` points to 8 writable bytes that
    // outlive `'a`; `8 - len` is in `0..=7`, so the slice stays inside them.
    unsafe {
        std::ptr::copy_nonoverlapping(be.as_ptr(), buffer, 8);
        std::slice::from_raw_parts(buffer.add(8 - len), len)
    }
}
310
-
311
- impl Parser for SymbolMapping {
312
- fn tokenizer ( & mut self , s : String ) -> Vec < u8 > {
313
- if s. len ( ) == 0 { return vec ! [ ] }
314
- // return s.as_bytes().to_vec();
315
- let mut z = self . symbols . write_zipper_at_path ( s. as_bytes ( ) ) ;
316
- if let Some ( r) = z. get_value ( ) {
317
- r. clone ( )
318
- } else {
319
- self . count += 1 ;
320
- let mut buf: [ u8 ; 8 ] = [ 0 ; 8 ] ;
321
- let slice = gen_key ( self . count , buf. as_mut_ptr ( ) ) ;
322
- let internal = slice. to_vec ( ) ;
323
- z. set_value ( internal. clone ( ) ) ;
324
- drop ( z) ;
325
- self . strings . insert ( slice, s) ;
326
- internal
327
- }
328
- }
329
- }
330
-
331
- impl SymbolMapping {
332
- pub fn token_lookup ( & self , token : & [ u8 ] ) -> Option < & String > {
333
- self . strings . get ( token)
334
- }
335
- }
336
278
337
279
338
280
impl Space {
@@ -344,6 +286,7 @@ impl Space {
344
286
unsafe { ( & self . btm as * const BytesTrieMap < ( ) > ) . cast_mut ( ) . as_mut ( ) . unwrap ( ) . write_zipper ( ) }
345
287
}
346
288
289
+
347
290
pub fn load_csv < R : Read > ( & mut self , mut r : R , sm : & mut SymbolMapping ) -> Result < usize , String > {
348
291
let mut i = 0 ;
349
292
let mut buf = vec ! [ ] ;
@@ -358,7 +301,7 @@ impl Space {
358
301
let mut ez = ExprZipper :: new ( e) ;
359
302
ez. loc += 1 ;
360
303
for symbol in sv. split ( |& x| x == b',' ) {
361
- let internal = sm. tokenizer ( unsafe { String :: from_utf8_unchecked ( symbol. to_vec ( ) ) } ) ;
304
+ let internal = sm. tokenizer ( symbol) ;
362
305
ez. write_symbol ( & internal[ ..] ) ;
363
306
ez. loc += internal. len ( ) + 1 ;
364
307
a += 1 ;
@@ -376,39 +319,71 @@ impl Space {
376
319
Ok ( i)
377
320
}
378
321
379
- pub fn load_json < R : Read > ( & mut self , mut r : R , sm : & ' static mut SymbolMapping ) -> Result < usize , String > {
380
- pub struct SpaceTranscriber < ' a , ' b , ' c > { count : usize , wz : & ' c mut WriteZipper < ' a , ' b , ( ) > , sm : & ' static mut SymbolMapping }
381
- impl < ' a , ' b , ' c > SpaceTranscriber < ' a , ' b , ' c > {
322
+ // pub fn load_csv<R : Read>(&mut self, mut r: R, sm: &mut SymbolMapping) -> Result<usize, String> {
323
+ // let mut i = 0;
324
+ // let mut buf = vec![];
325
+ // let mut stack = [0u8; 2048];
326
+
327
+ // match r.read_to_end(&mut buf) {
328
+ // Ok(read) => {
329
+ // for sv in buf.split(|&x| x == b'\n') {
330
+ // if sv.len() == 0 { continue }
331
+ // let mut a = 0;
332
+ // let e = Expr{ ptr: stack.as_mut_ptr() };
333
+ // let mut ez = ExprZipper::new(e);
334
+ // ez.loc += 1;
335
+ // for symbol in sv.split(|&x| x == b',') {
336
+ // let internal = sm.tokenizer(unsafe { String::from_utf8_unchecked(symbol.to_vec()) });
337
+ // ez.write_symbol(&internal[..]);
338
+ // ez.loc += internal.len() + 1;
339
+ // a += 1;
340
+ // }
341
+ // let total = ez.loc;
342
+ // ez.reset();
343
+ // ez.write_arity(a);
344
+ // self.btm.insert(&stack[..total], ());
345
+ // i += 1;
346
+ // }
347
+ // }
348
+ // Err(e) => { return Err(format!("{:?}", e)) }
349
+ // }
350
+
351
+ // Ok(i)
352
+ // }
353
+ pub fn load_json < R : Read > ( & mut self , mut r : R , sm : & mut SymbolMapping ) -> Result < usize , String > {
354
+ pub struct SpaceTranscriber < ' a , ' b , ' c , ' sm > { count : usize , wz : & ' c mut WriteZipper < ' a , ' b , ( ) > , sm : & ' sm mut SymbolMapping }
355
+ impl < ' a , ' b , ' c , ' sm > SpaceTranscriber < ' a , ' b , ' c , ' sm > {
382
356
#[ inline( always) ] fn write < S : Into < String > > ( & mut self , s : S ) {
383
- let token = self . sm . tokenizer ( s. into ( ) ) ;
357
+ let s_ : String = s. into ( ) ;
358
+ let token = self . sm . tokenizer ( s_. as_bytes ( ) ) ;
384
359
let mut path = vec ! [ item_byte( Tag :: SymbolSize ( token. len( ) as u8 ) ) ] ;
385
360
path. extend ( token) ;
386
361
self . wz . descend_to ( & path[ ..] ) ;
387
362
self . wz . set_value ( ( ) ) ;
388
363
self . wz . ascend ( path. len ( ) ) ;
389
364
}
390
365
}
391
- impl < ' a , ' b , ' c > crate :: json_parser:: Transcriber for SpaceTranscriber < ' a , ' b , ' c > {
366
+ impl < ' a , ' b , ' c , ' sm > crate :: json_parser:: Transcriber for SpaceTranscriber < ' a , ' b , ' c , ' sm > {
392
367
#[ inline( always) ] fn descend_index ( & mut self , i : usize , first : bool ) -> ( ) {
393
368
if first { self . wz . descend_to ( & [ item_byte ( Tag :: Arity ( 2 ) ) ] ) ; }
394
- let token = self . sm . tokenizer ( i. to_string ( ) ) ;
369
+ let token = self . sm . tokenizer ( i. to_string ( ) . as_bytes ( ) ) ;
395
370
self . wz . descend_to ( & [ item_byte ( Tag :: SymbolSize ( token. len ( ) as u8 ) ) ] ) ;
396
371
self . wz . descend_to ( token) ;
397
372
}
398
373
#[ inline( always) ] fn ascend_index ( & mut self , i : usize , last : bool ) -> ( ) {
399
- self . wz . ascend ( self . sm . tokenizer ( i. to_string ( ) ) . len ( ) + 1 ) ;
374
+ self . wz . ascend ( self . sm . tokenizer ( i. to_string ( ) . as_bytes ( ) ) . len ( ) + 1 ) ;
400
375
if last { self . wz . ascend ( 1 ) ; }
401
376
}
402
377
/// Records an empty array as the symbol `"[]"` and bumps `count` by one.
#[inline(always)]
fn write_empty_array(&mut self) -> () {
    self.write("[]");
    self.count += 1;
}
403
378
#[ inline( always) ] fn descend_key ( & mut self , k : & str , first : bool ) -> ( ) {
404
379
if first { self . wz . descend_to ( & [ item_byte ( Tag :: Arity ( 2 ) ) ] ) ; }
405
- let token = self . sm . tokenizer ( k. to_string ( ) ) ;
380
+ let token = self . sm . tokenizer ( k. to_string ( ) . as_bytes ( ) ) ;
406
381
// let token = k.to_string();
407
382
self . wz . descend_to ( & [ item_byte ( Tag :: SymbolSize ( token. len ( ) as u8 ) ) ] ) ;
408
383
self . wz . descend_to ( token) ;
409
384
}
410
385
#[ inline( always) ] fn ascend_key ( & mut self , k : & str , last : bool ) -> ( ) {
411
- let token = self . sm . tokenizer ( k. to_string ( ) ) ;
386
+ let token = self . sm . tokenizer ( k. to_string ( ) . as_bytes ( ) ) ;
412
387
// let token = k.to_string();
413
388
self . wz . ascend ( token. len ( ) + 1 ) ;
414
389
if last { self . wz . ascend ( 1 ) ; }
@@ -442,30 +417,35 @@ impl Space {
442
417
Ok ( st. count )
443
418
}
444
419
445
- pub fn load < R : Read > ( & mut self , r : R , sm : & mut SymbolMapping ) -> Result < usize , String > {
446
- let mut it = BufferedIterator :: new ( r) ;
447
420
448
- let t0 = Instant :: now ( ) ;
449
- let mut i = 0 ;
450
- let mut stack = [ 0u8 ; 2048 ] ;
451
- let mut vs = Vec :: with_capacity ( 64 ) ;
452
- loop {
453
- let mut ez = ExprZipper :: new ( Expr { ptr : stack. as_mut_ptr ( ) } ) ;
454
- match sm. sexprUnsafe :: < R > ( & mut it, & mut vs, & mut ez) {
455
- Ok ( ( ) ) => {
456
- self . btm . insert ( & stack[ ..ez. loc ] , ( ) ) ;
457
- }
458
- Err ( ParserError :: InputFinished ( ) ) => { break }
459
- Err ( other) => { return Err ( format ! ( "{:?}" , other) ) }
460
- }
461
- i += 1 ;
462
- vs. clear ( ) ;
463
- }
464
- println ! ( "loading took {} ms" , t0. elapsed( ) . as_millis( ) ) ;
465
- Ok ( i)
421
+ // // TODO integrate with new code?
422
+ pub fn load < R : Read > ( & mut self , r : R , sm : & mut SymbolMapping ) -> Result < usize , String > {
423
+ #![ allow( unused) ]
424
+ core:: todo!( "Figure out what version of the parser this expects" ) ;
425
+ // let mut it = BufferedIterator::new(r);
426
+
427
+ // let t0 = Instant::now();
428
+ // let mut i = 0;
429
+ // let mut stack = [0u8; 2048];
430
+ // let mut vs = Vec::with_capacity(64);
431
+ // loop {
432
+ // let mut ez = ExprZipper::new(Expr{ptr: stack.as_mut_ptr()});
433
+ // match sm.sexprUnsafe::<R>(&mut it, &mut vs, &mut ez) {
434
+ // Ok(()) => {
435
+ // self.btm.insert(&stack[..ez.loc], ());
436
+ // }
437
+ // Err(ParserError::InputFinished) => { break }
438
+ // Err(other) => { return Err(format!("{:?}", other)) }
439
+ // }
440
+ // i += 1;
441
+ // vs.clear();
442
+ // }
443
+ // println!("loading took {} ms", t0.elapsed().as_millis());
444
+ // Ok(i)
466
445
}
467
446
468
- pub fn dump < W : Write > ( & self , w : & mut W , sm : & ' static SymbolMapping ) -> Result < usize , String > {
447
+
448
+ pub fn dump < W : Write > ( & self , w : & mut W , sm : & SymbolMapping ) -> Result < usize , String > {
469
449
let mut rz = self . btm . read_zipper ( ) ;
470
450
471
451
let t0 = Instant :: now ( ) ;
@@ -537,6 +517,7 @@ impl Space {
537
517
} ) ;
538
518
}
539
519
520
+ #[ cfg( feature = "pathmap_counters" ) ]
540
521
pub fn done ( & mut self , symbol_mapping : SymbolMapping ) -> ! {
541
522
let counters = pathmap:: counters:: Counters :: count_ocupancy ( & self . btm ) ;
542
523
counters. print_histogram_by_depth ( ) ;
0 commit comments