forked from fanfank/timecat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
timecat
executable file
·1175 lines (1000 loc) · 40.4 KB
/
timecat
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/python
# -*- coding: utf8 -*-
"""
@author xuruiqi([email protected])
@site https://github.com/fanfank/timecat
@date 20160106
@desc timecat is a command line tool for
saving disk I/O when you have to output
specific logs in between a time span.
It uses binary search to directly locate
the start position and end position, then
output content between the target positions.
"""
__version__ = "2.0.0"
import argparse
import copy
import datetime
import os
import re
import time
import traceback
import sys
# Seconds to sleep between reads when rate limiting is requested
# (presumably consumed by the --lpms option handler — confirm in main).
SLEEP_DURATION = 0.001 # 1ms
# Hard upper bound for a single log line; forward_match() aborts with an
# error once an unterminated line grows past this many bytes.
MAX_LINE_LENGTH = 1024 * 1024 * 128 # 128MB
# Runtime switches, flipped from the parsed command line flags.
enable_color = False
enable_verbose = False
# Diagnostics counters: number of lines actually read from disk, and the
# number of binary-search iterations performed.
scan_line_num = 0
binary_loop_num = 0
# Known datetime layouts, tried in order by detect_datetime_format().
# Order matters: more specific patterns must precede more general ones
# (e.g. the microsecond variant before the plain seconds variant).
# Each entry:
#   "desc"           — human readable example of the layout
#   "regex"          — pattern with named groups (YEAR, MONTH, ...)
#   "direct_compare" — True when two matched strings of this layout can
#                      be compared lexicographically without normalizing
# NOTE: regexes are raw strings; the values are identical to the previous
# plain strings ("\d" was never a recognized escape) but raw literals avoid
# the invalid-escape-sequence warning on modern Python.
DATETIME_FORMAT_LIST = [
    # sequence is important
    # tag the format with YEAR, MONTH, DAY ...
    {
        "desc": "e.g. 2016-01-02 20:13:14.666",
        "regex": r"(?P<YEAR>\d{4})\D(?P<MONTH>\d{2})\D(?P<DAY>\d{2})\D(?P<HOUR>\d{2})\D(?P<MINUTE>\d{2})\D(?P<SECOND>\d{2})\.(?P<MICROSECOND>\d{1,3})",
        "direct_compare": True,
    },
    {
        "desc": "e.g. 2016-01-02 20:13:14",
        "regex": r"(?P<YEAR>\d{4})\D(?P<MONTH>\d{2})\D(?P<DAY>\d{2})\D(?P<HOUR>\d{2})\D(?P<MINUTE>\d{2})\D(?P<SECOND>\d{2})",
        "direct_compare": True,
    },
    {
        "desc" : "syslog. e.g. Jan  2 20:13:14",
        "regex": r"(?P<MONTH>[a-zA-Z]{3})\D(?P<DAY>[ \d]{1}\d{1})\D(?P<HOUR>\d{2})\D(?P<MINUTE>\d{2})\D(?P<SECOND>\d{2})",
        "direct_compare": False,
    },
    {
        "desc" : "e.g. 2016/Jan/02 20:13:14.666",
        "regex": r"(?P<DAY>\d{2})\D(?P<MONTH>[a-zA-Z]{3})\D(?P<YEAR>\d{4})\D(?P<HOUR>\d{2})\D(?P<MINUTE>\d{2})\D(?P<SECOND>\d{2})\.(?P<MICROSECOND>\d{1,3})",
        "direct_compare": False,
    },
    {
        "desc" : "e.g. 02-Jan-2016 20:13:14",
        "regex": r"(?P<DAY>\d{2})\D(?P<MONTH>[a-zA-Z]{3})\D(?P<YEAR>\d{4})\D(?P<HOUR>\d{2})\D(?P<MINUTE>\d{2})\D(?P<SECOND>\d{2})",
        "direct_compare": False,
    },
    {
        "desc" : "datetime without seperator. e.g. 20160102201314",
        "regex": r"(?P<YEAR>\d{4})(?P<MONTH>\d{2})(?P<DAY>\d{2})(?P<HOUR>\d{2})(?P<MINUTE>\d{2})(?P<SECOND>\d{2})",
        "direct_compare": True,
    },
    {
        "desc" : "timestamp in microseconds",
        "regex": r"(?P<MICROTS>\d{13})",
        "direct_compare": True,
    },
    {
        "desc" : "timestamp in seconds",
        "regex": r"(?P<TS>\d{10})",
        "direct_compare": True,
    },
    {
        "desc" : "only hour, minute and seconds",
        "regex": r"(?P<HOUR>\d{2}):(?P<MINUTE>\d{2}):(?P<SECOND>\d{2})",
        "direct_compare": True,
    },
    {
        "desc" : "only hour and minutes",
        "regex": r"(?P<HOUR>\d{2}):(?P<MINUTE>\d{2})",
        "direct_compare": True,
    },
]
# Normalization tables: every way a month/day may appear in a log line
# ("Jan", "01", "1", " 1", ...) maps to its zero-padded two-digit form.
_MONTH_ABBRS = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
MONTH_DICT = {}
for _idx, _abbr in enumerate(_MONTH_ABBRS, 1):
    _mm = "%02d" % _idx
    MONTH_DICT[_abbr] = _mm   # three-letter abbreviation
    MONTH_DICT[_mm] = _mm     # already zero-padded
for _idx in range(1, 10):
    _mm = "0%d" % _idx
    MONTH_DICT[str(_idx)] = _mm    # bare single digit
    MONTH_DICT[" %d" % _idx] = _mm # space-padded single digit
# Days: "01".."31" map to themselves, space-padded " 1".." 9" get zero-padded.
DAY_DICT = {}
for _idx in range(1, 32):
    _dd = "%02d" % _idx
    DAY_DICT[_dd] = _dd
for _idx in range(1, 10):
    DAY_DICT[" %d" % _idx] = "0%d" % _idx
# Comparison operators by symbol; "=" is accepted as an alias of "==".
CMP_OP_DICT = {
    "<" : lambda a, b: a < b,
    "<=": lambda a, b: a <= b,
    ">" : lambda a, b: a > b,
    ">=": lambda a, b: a >= b,
    "=" : lambda a, b: a == b,
    "==": lambda a, b: a == b,
    "!=": lambda a, b: a != b,
}
def color(content, cl = "green"):
    """Wrap content in an ANSI color escape sequence.

    Params:
        content # text to colorize
        cl      # color name; only "green" and "red" are supported
    Returns content unchanged when colorized output is disabled or the
    color name is unknown.
    """
    if not enable_color:
        return content
    templates = {
        "green": "\x1B[0;32;40m{}\x1B[0m",
        "red": "\x1B[0;31;40m{}\x1B[0m",
    }
    template = templates.get(cl)
    if template is None:
        return content
    return template.format(content)
def ts2dt(ts, dt_format="%Y-%m-%d %H:%M:%S"):
return datetime.datetime.fromtimestamp(ts).strftime(dt_format)
def dt2ts(dt, dt_format="%Y-%m-%d %H:%M:%S"):
    """Parse a datetime string and return its unix timestamp (local time,
    float as returned by time.mktime)."""
    parsed = time.strptime(dt, dt_format)
    return time.mktime(parsed)
def init_parser(target_parser):
    """Register timecat's command line arguments on target_parser.

    Params:
        target_parser # an argparse.ArgumentParser (or compatible) object
    Fixes: user-facing typos in help text
    ("includsive" -> "inclusive", "excludsive" -> "exclusive");
    removed the long-dead commented-out "-r/--regex-format" option.
    """
    target_parser.add_argument(
        "-s", "--start-datetime", dest = "start",
        required = True,
        help = "Which datetime to start(inclusive). " \
            "e.g. " \
            "\"-s '2016-01-02 20:13:14'\", " \
            "\"-s '2016/01/02 20:13:14'\", " \
            "\"-s '2016-01-02T12:13:14'\", " \
            "\"-s '2016-01-02T12:13:14.000'\", " \
            "\"-s '02/Jan/2016:20:13:14'\", " \
            "\"-s '02-Jan-2016 20:13:14'\", " \
            "\"-s '02 Jan 2016 20:13:14'\", " \
            "\"-s 'Jan 2 20:13:14'\", " \
            "\"-s '20160102201315'\", " \
            "\"-s '1451736794'\", " \
            "\"-s '20:13'\", etc. We will exhaust " \
            "our effort to cover regular datetime formats. " \
            "The format of -s and -e params do not need to " \
            "be consistent with the datetime format in the file.")
    target_parser.add_argument(
        "-e", "--end-datetime", dest = "end",
        default = None,
        help = "Stop after reaching this datetime(exclusive). " \
            "Same format as \"-s\". If not set, means output " \
            "till the end of file.")
    target_parser.add_argument(
        "-d", "--date", dest = "date",
        default = None,
        help = "This is an optional argument. With \"-d\", the " \
            "following two statements are essentially the same: " \
            "\"timecat -s '2016-01-02 20:13:14' -e '2016-01-02 20:14:13' ...\" " \
            "and \"timecat -d '2016-01-02' -s '20:13:14' -e '20:14:13' ...\"." )
    target_parser.add_argument(
        "--lpms", dest="lpms",
        type=int, default=-1,
        help="Reading speed control, "\
            "after how many lines read should the program " \
            "sleep for 1ms, default to -1, meaning no limit.")
    target_parser.add_argument(
        "-v", "--verbose", dest = "enable_verbose",
        action = "store_true",
        default = False,
        help = "print additional information")
    target_parser.add_argument(
        "--color", dest = "enable_color",
        action = "store_true", default = False,
        help = "Whether to enable colorized output")
    target_parser.add_argument(
        "file", nargs = "+",
        help = "files to be timecat.")
def dtcmp(lhs, rhs, format_info, cmp_op):
    """Compare two datetime strings under a detected format.

    Params:
        lhs          # left hand side datetime string
        rhs          # right hand side datetime string
        format_info = {
            "regex"          # the regular expression
            "parser"         # the compiled regular expression
            "direct_compare" # True when the strings compare correctly
                             # as plain strings
        }
        cmp_op       # one of ">", "<", "=", "==", ">=", "<=", "!="
    Returns the boolean result of applying cmp_op.
    Authors: xuruiqi
    """
    compare = CMP_OP_DICT[cmp_op]
    # (in)equality tests and directly-comparable layouts need no
    # component-wise normalization
    if cmp_op in ("=", "!=") or format_info["direct_compare"]:
        return compare(lhs, rhs)
    lhs_parts = format_info["parser"].search(lhs).groupdict()
    rhs_parts = format_info["parser"].search(rhs).groupdict()
    # compare component by component, most significant first;
    # zero-padded strings compare correctly as strings
    for component in ("YEAR", "MONTH", "DAY", "HOUR",
                      "MINUTE", "SECOND", "MILISECOND", "MICROSECOND"):
        left = lhs_parts.get(component, None)
        right = rhs_parts.get(component, None)
        if component == "MONTH" and left:
            # normalize month names / paddings to "01".."12"
            left = MONTH_DICT[left]
            right = MONTH_DICT[right]
        if left != right:
            return compare(left, right)
    # every component equal: strict comparisons fail, the rest succeed
    return cmp_op not in (">", "<")
def get_bi_cmp_func(file_format_info, param_format_info):
    """Return a two-argument comparator matching the file's sort order.

    Ascending files get a ">=" comparator, descending files a "<="
    comparator, both delegating to dtcmp() with file_format_info.
    Authors: xuruiqi
    """
    if file_format_info["order"] == "ascending":
        return lambda lhs, rhs: dtcmp(lhs, rhs, file_format_info, ">=")
    return lambda lhs, rhs: dtcmp(lhs, rhs, file_format_info, "<=")
def get_bi_cmp_func2(file_format_info, cmp_pattern_format_info, cmp_pattern):
    """a refined function of get_bi_cmp_func since timecat 2.x

    Converts the user supplied cmp_pattern into the same normalized form
    the file's own datetimes adapt to, then returns a one-argument
    comparator taking a regex match of a file line:
      - ">=" semantics for ascending files, "<=" for descending ones.
    Params:
        file_format_info        # format dict of the file (needs "parser",
                                # "sample" and "order")
        cmp_pattern_format_info # format dict detected from cmp_pattern
        cmp_pattern             # the -s / -e value given by the user
    Bug fixes vs 2.0.0:
      * MONTH-pattern branch referenced an undefined `year_pattern`;
      * MICROTS branches passed a *string* to ts2dt() and assigned the
        wrong variable (`year_month_day` instead of `year`);
      * the MICROTS->final conversion assigned `final_pattern_type`
        where `dtformat` was intended, so HOUR/MONTH files got the full
        "%Y%m%d%H%M%S" format back;
      * str(dt2ts(...)) produced "1451736794.0", corrupting the
        lexicographic comparison — now wrapped in int();
      * the "MICROTS"/"TS" adapters received match objects but treated
        them as strings — they now extract group(0).
    Authors: xuruiqi
    """
    global DAY_DICT
    global MONTH_DICT
    # unify the datetime formats from file and user input
    file_sample_match = file_format_info["parser"].search(
        file_format_info["sample"])
    cmp_pattern_match = cmp_pattern_format_info["parser"].search(
        cmp_pattern)
    # Each adapter maps a regex match to a sortable, zero-padded string.
    def year_adapt_func(match):
        #TODO(xuruiqi) have to consider global time
        mgd = match.groupdict()
        return mgd.get("YEAR", "1970") + \
            str(MONTH_DICT[mgd.get("MONTH", "01")]) + \
            str(DAY_DICT[mgd.get("DAY", "01")]) + \
            mgd.get("HOUR", "00") + \
            mgd.get("MINUTE", "00") + \
            mgd.get("SECOND", "00") + \
            "%03d" % int(mgd.get("MICROSECOND", "000"))
    def month_adapt_func(match):
        mgd = match.groupdict()
        return str(MONTH_DICT[mgd.get("MONTH", "01")]) + \
            str(DAY_DICT[mgd.get("DAY", "01")]) + \
            mgd.get("HOUR", "00") + \
            mgd.get("MINUTE", "00") + \
            mgd.get("SECOND", "00") + \
            "%03d" % int(mgd.get("MICROSECOND", "000"))
    def hour_adapt_func(match):
        mgd = match.groupdict()
        return mgd.get("HOUR", "00") + \
            mgd.get("MINUTE", "00") + \
            mgd.get("SECOND", "00") + \
            "%03d" % int(mgd.get("MICROSECOND", "000"))
    adapt_func_dict = {
        # adapters receive a regex match object (see ge/le below)
        "MICROTS": lambda m: m.group(0),
        "TS" : lambda m: m.group(0) + "000",
        "YEAR" : year_adapt_func,
        "MONTH" : month_adapt_func,
        "HOUR" : hour_adapt_func,
    }
    pattern_type_conversion_dict = {
        "MICROTS": "MICROTS",
        "TS" : "MICROTS",
        "YEAR" : "YEAR",
        "MONTH" : "MONTH",
        "HOUR" : "HOUR",
    }
    fsm_groupdict = file_sample_match.groupdict()
    cpm_groupdict = cmp_pattern_match.groupdict()
    final_pattern_type = None
    file_pattern_adapt_func = None
    # pick the most specific group present in the file's sample
    for key in ["MICROTS", "TS", "YEAR", "MONTH", "HOUR"]:
        if fsm_groupdict.get(key, None):
            final_pattern_type = pattern_type_conversion_dict[key]
            file_pattern_adapt_func = adapt_func_dict[key]
            break
    if final_pattern_type is None:
        raise Exception("Invalid datetime format in file")
    # Convert cmp_pattern to the unified MICROTS (13-digit) form first.
    if cpm_groupdict.get("MICROTS", None):
        pass
    elif cpm_groupdict.get("TS", None):
        cmp_pattern = cmp_pattern + "000"
    elif cpm_groupdict.get("YEAR", None):
        year_pattern = adapt_func_dict["YEAR"](cmp_pattern_match)
        # int(): time.mktime returns a float and str() would leak ".0"
        cmp_pattern = str(int(dt2ts(year_pattern[:-3], "%Y%m%d%H%M%S"))) + \
            year_pattern[-3:]
    elif cpm_groupdict.get("MONTH", None):
        # this situation is a little bit compilicated
        # When file is in a TS/MICROTS/YEAR format
        # we have to guess the year of the cmp pattern
        if final_pattern_type == "MONTH" or final_pattern_type == "HOUR":
            # no real year available on either side: default to 1970
            year_pattern = adapt_func_dict["YEAR"](cmp_pattern_match)
            cmp_pattern = str(int(dt2ts(year_pattern[:-3], "%Y%m%d%H%M%S"))) + \
                year_pattern[-3:]
        elif final_pattern_type == "YEAR" or final_pattern_type == "MICROTS":
            month_pattern = adapt_func_dict["MONTH"](cmp_pattern_match)
            # borrow the year from the file's sample line
            if final_pattern_type == "YEAR":
                year = file_pattern_adapt_func(file_sample_match)[0:4]
            else:
                # first 10 digits of a 13-digit microsecond ts = seconds
                year = ts2dt(int(file_sample_match.group(0)[:10]), "%Y")
            cmp_pattern = \
                str(int(dt2ts(year + month_pattern[:-3],
                              "%Y%m%d%H%M%S"))) + \
                month_pattern[-3:]
    elif cpm_groupdict.get("HOUR", None):
        # this situation is a little bit compilicated
        # When file is in a TS/MICROTS/YEAR format
        # we have to guess the year, month and day
        # of the cmp pattern
        if final_pattern_type == "HOUR":
            year_pattern = adapt_func_dict["YEAR"](cmp_pattern_match)
            cmp_pattern = str(int(dt2ts(year_pattern[:-3], "%Y%m%d%H%M%S"))) + \
                year_pattern[-3:]
        elif final_pattern_type == "YEAR" or final_pattern_type == "MICROTS":
            hour_pattern = adapt_func_dict["HOUR"](cmp_pattern_match)
            # borrow year, month and day from the file's sample line
            if final_pattern_type == "YEAR":
                year_month_day = file_pattern_adapt_func(file_sample_match)[0:8]
            else:
                year_month_day = \
                    ts2dt(int(file_sample_match.group(0)[:10]), "%Y%m%d")
            cmp_pattern = \
                str(int(dt2ts(year_month_day + hour_pattern[:-3],
                              "%Y%m%d%H%M%S"))) + \
                hour_pattern[-3:]
    #TODO(xuruiqi) Latter we have to consider global time
    # Convert from MICROTS back to final_pattern_type so the comparator
    # compares strings of the same layout and length.
    if final_pattern_type != "MICROTS":
        dtformat = "%Y%m%d%H%M%S"
        if final_pattern_type == "HOUR":
            dtformat = "%H%M%S"
        elif final_pattern_type == "MONTH":
            dtformat = "%m%d%H%M%S"
        cmp_pattern = ts2dt(int(cmp_pattern[0:10]), dtformat) + \
            cmp_pattern[-3:]
    # Finish unifying the datetime format from user input and file
    # Return corresponding compare function
    def ge(line_match):
        adapt_pattern = file_pattern_adapt_func(line_match)
        return adapt_pattern >= cmp_pattern
    def le(line_match):
        adapt_pattern = file_pattern_adapt_func(line_match)
        return adapt_pattern <= cmp_pattern
    if file_format_info["order"] == "ascending":
        return ge
    else:
        return le
def at_line_head(f):
    """Return True when f's read pointer sits at the beginning of a line.

    Position 0 is always a line head; otherwise the character just before
    the current position must be a newline.  The read pointer ends up at
    the same offset it started at.
    Authors: xuruiqi
    """
    pos = f.tell()
    if pos == 0:
        return True
    f.seek(pos - 1)
    return f.read(1) == "\n"
def locate_current_line(f, ed, st_bound,
        backward_step_hint = 1024 * 4):
    """Return the start position of the line containing ed.

    Triggers a backward search; every inspected position stays within
    [st_bound, ed], inclusively.  If no line head is found in that range,
    st_bound is returned.  If ed is exactly at EOF, ed is returned as-is.

    Params:
        f                  # readable, seekable file object
        ed                 # offset whose enclosing line head is wanted
        st_bound           # lower bound (inclusive) of the search
        backward_step_hint # bytes to step backwards per round
    Authors: xuruiqi
    TODO(xuruiqi) set a limit to the max length of a line
    """
    global scan_line_num
    f.seek(0, os.SEEK_END)
    eof_pos = f.tell()
    if ed == eof_pos:
        return ed
    # old_pos means after which(inclusive) all the data has
    # been read and no need to read again in this function
    old_pos = min(ed + 1, eof_pos)
    backward_step = backward_step_hint
    while old_pos != st_bound:
        # step back by backward_step, clamped to st_bound
        new_pos = old_pos - backward_step
        if new_pos < st_bound:
            new_pos = st_bound
        f.seek(new_pos)
        # collect all lines in [new_pos, old_pos)
        lines = []
        while f.tell() < old_pos:
            line = f.readline(old_pos - f.tell())
            lines.append(line)
            scan_line_num += 1
        # if only one line(maybe not a complete line) is read
        # check if the first character of this "line" is really
        # at the head of line
        if len(lines) == 1:
            f.seek(0 - len(lines[-1]), os.SEEK_CUR)
            if at_line_head(f):
                return f.tell()
        # multi lines read. directly set the position to the
        # head of last line
        # (a newline must precede lines[-1], so its start is a true head)
        else:
            f.seek(0 - len(lines[-1]), os.SEEK_CUR)
            return f.tell()
        # not done, continue to look for the head of a line
        old_pos = new_pos
    # no line found, return st_bound directly
    f.seek(st_bound)
    return st_bound
def locate_next_line(f, st, ed_bound, forward_step = 1024 * 4):
    """In range [st, ed_bound), find the position of the head of a line,
    starting from st.  If no such position is found, ed_bound is returned.

    Params:
        f            # readable, seekable file object
        st           # search start offset (inclusive)
        ed_bound     # search end offset (exclusive)
        forward_step # max bytes per readline() call
    Authors: xuruiqi
    Bugfix: the readline limit used to be computed once before the loop,
    so a long line crossing ed_bound could push the pointer past ed_bound
    and the "!=" loop guard never stopped it, returning a position outside
    the documented range.  The limit is now recomputed on every read so
    the pointer can never pass ed_bound.
    TODO(xuruiqi) add a limit to the max length of a line
    """
    global scan_line_num
    f.seek(st)
    # st itself may already be a line head
    if not at_line_head(f):
        # read forward until a newline is consumed or ed_bound is reached
        while f.tell() < ed_bound:
            # cap every read at the remaining range
            if not f.readline(min(forward_step, ed_bound - f.tell())):
                break
            scan_line_num += 1
            if at_line_head(f):
                return f.tell()
    return f.tell()
def forward_match(f, st, ed, regprog, ed_inclusive = True,
        forward_step_hint = 1024 * 4):
    """read until regprog matches a line or excceed ed
    if a match is found, then set f's reading pointer to
    the corresponding line head, else to ed
    return the match obj, the last matched line, and the
    position of the head of the matched line
    Params:
        f                 # readable, seekable file object
        st                # scan start offset
        ed                # scan end offset
        regprog           # compiled regex searched against each line
        ed_inclusive      # when True, extend ed to the end of the line
                          # containing ed before scanning
        forward_step_hint # max bytes per readline() call
    Authors: xuruiqi
    Return: match # the matched object
            line # the last read line
                 # (may not be a complete line)
            f.tell() # the head of the line
            (None, None, None) is returned when a single line grows past
            MAX_LINE_LENGTH.
    """
    global MAX_LINE_LENGTH
    global scan_line_num
    if ed_inclusive:
        # extend ed to cover the whole line that contains it
        f.seek(ed)
        f.readline()
        ed = f.tell()
        scan_line_num += 1
    f.seek(st)
    match = None
    line = None
    while f.tell() < ed:
        scan_line_num += 1
        # try to read a complete line
        # (piecewise, so a read never passes ed and a runaway line is
        # detected against MAX_LINE_LENGTH)
        line = ""
        while len(line) < MAX_LINE_LENGTH and f.tell() < ed and \
                (len(line) == 0 or line[-1] != "\n"):
            line += f.readline(
                min(ed - f.tell(), forward_step_hint))
        if len(line) > MAX_LINE_LENGTH:
            sys.stderr.write(
                color(
                    "line too long, excceeds {} bytes\n".format(
                        MAX_LINE_LENGTH),
                    cl = "red"))
            return None, None, None
        match = regprog.search(line)
        if match:
            # rewind to the head of the matched line
            f.seek(0 - len(line), os.SEEK_CUR)
            return match, line, f.tell()
    f.seek(ed)
    return match, line, f.tell()
def backward_match(f, ed, st, regprog, backward_step_hint = 1024 * 4):
    """backward read until regprog matches a line or excceed st
    if the matched line is found, then locate f's reading pointer
    to the corresponding head of the line, else locate to st
    return the matched obj, the last read line and the head
    position of the line
    NOTE: ed is not read
    Params:
        f                  # readable, seekable file object
        ed                 # scan upper bound (exclusive)
        st                 # scan lower bound (inclusive)
        regprog            # compiled regex searched against each line
        backward_step_hint # bytes to step backwards per round
    Authors: xuruiqi
    Return: match # the matched object
            line # last line read(maybe not a complete line)
            f.tell() # the head position of the line
    TODO(xuruiqi) add a limit to max length of the line
    """
    global scan_line_num
    f.seek(ed)
    if f.tell() < st:
        return None, None, None
    match = None
    line = None
    old_pos = f.tell()
    # cache backward read content in case failing to read a whole
    # line during a loop round
    last_buffer = ""
    backward_step = backward_step_hint
    while (not match) and (old_pos > st):
        # step back by backward_step, clamped at st
        new_pos = old_pos - backward_step
        if new_pos < st:
            new_pos = st
        f.seek(new_pos)
        # read everything in [new_pos, old_pos) line by line
        lines = []
        while f.tell() < old_pos:
            scan_line_num += 1
            line = f.readline(old_pos - f.tell())
            lines.append(line)
        # reposition to new_pos so at_line_head() below inspects the
        # character just before the window
        f.seek(new_pos)
        valid_start_index = 0
        if len(lines) == 1:
            # when len(lines) == 1, there may be the following
            # possibilities:
            #   1. no newline character is read
            #     1.1. the head of the read data is the head of
            #          a line
            #     1.2. the head of the read data is not the
            #          head of a line
            #   2. newline character is read
            #     2.1. the head of the read data is the head of
            #          a line
            #     2.2. the head of the read data is not the
            #          head of a line
            # we may find that 1.1 is the same as 2.1, in this
            # case we have to concatenate the read content and
            # the last_buffer, then return if the line is valid
            #
            # And we may also find that 1.2 is the same as 2.2,
            # concatenate the read content and the last_buffer,
            # then update last_buffer with the concatenated data
            if at_line_head(f):
                lines[0] = lines[0] + last_buffer
                last_buffer = ""
            else:
                last_buffer = lines[0] + last_buffer
                lines = []
        else:
            # when len(lines) != 1, there may be the following
            # possibilities:
            #   1. lines[0] is not a complete line
            #   2. lines[0] is a complete line
            # we can judge by checking if the first character
            # of lines[0] is at line head
            lines[-1] = lines[-1] + last_buffer
            last_buffer = ""
            if not at_line_head(f) and new_pos != st:
                # lines[0] is not a complete line
                # nor does lines[0][0] at position st
                last_buffer = lines[0]
                valid_start_index = 1
        if new_pos == st and len(last_buffer) > 0:
            # new_pos == st means the loop will end
            # after this round, so we have to handle
            # data in last_buffer
            lines.append(last_buffer)
        # total byte length of this round's window (used to translate a
        # line index back into a file offset)
        total_lines_length = 0
        for line in lines:
            total_lines_length += len(line)
        # handle data from this round
        # scan lines from the last (closest to ed) backwards
        cur_lines_length = 0
        for index in reversed(range(valid_start_index, len(lines))):
            line = lines[index]
            cur_lines_length += len(line)
            match = regprog.search(line)
            if match:
                # locate f's reading pointer
                f.seek(
                    total_lines_length \
                    - cur_lines_length \
                    + new_pos)
                return match, line, f.tell()
        # update old_pos
        old_pos = new_pos
    f.seek(st)
    return None, line, f.tell()
def binary_seek_pos(f, st, ed, cmp_pattern, param_format_info,
        file_format_info):
    """use binary search to find the first line that is bigger/smaller
    than the cmp_pattern when file is in ascending/descending order

    Params:
        f                 # readable, seekable file object
        st                # lower search offset bound
        ed                # upper search offset bound
        cmp_pattern       # user supplied datetime string to locate
        param_format_info # detected format of cmp_pattern
        file_format_info  # detected format (and order) of the file
    Returns the offset of the first qualifying line head, or None when
    no line qualifies.
    Authors: xuruiqi
    Fix: use integer floor division "//" for the midpoint — identical
    under Python 2 but "/" would yield a float under Python 3.
    """
    global binary_loop_num
    # record the valid read pointer range
    st_bound = st
    ed_bound = ed
    # locate ed to the start of the current line
    # unless ed is at eof
    ed = locate_current_line(f, ed, st_bound)
    # get compare function according to file_format_info
    # ae => after or equal to
    # when file is in ascending order, ae == ">="
    # when file is in descending order, ae == "<="
    # NOTE: actually you can comprehense ae in this way:
    #   it indicates that the left hand side parameter
    #   stands after or has the same value as the right
    #   hand side parameter
    ae = get_bi_cmp_func2(file_format_info, param_format_info, cmp_pattern)
    # start doing binary search
    regprog = file_format_info["parser"]
    while st < ed:
        binary_loop_num += 1
        mid = st + (ed - st) // 2
        # read until regprog matches the line
        f.seek(mid)
        match, line, res_pos = forward_match(f, mid, ed, regprog)
        if match:
            # modify group(0) compare if match pattern is after
            # or equal to the cmp_pattern
            if ae(match):
                if res_pos == ed:
                    # in case this causes a dead loop, backward
                    # search a line and compare
                    # NOTE: if we do not handle res_pos == ed
                    #   situation, we may encouter a dead loop,
                    #   say only 2 lines left, the 1st line has
                    #   10 bytes, the 2nd line has 100 bytes,
                    #   then "mid" will always locate within the
                    #   2nd line, if pattern in the 2nd line
                    #   accidently after or equal to the
                    #   cmp_pattern, a dead loop occurs, because
                    #   "ed" will not change in the next round
                    match, line, back_res_pos = backward_match(
                        f, mid, st, regprog)
                    if not match or back_res_pos == res_pos:
                        # this means only one line left, and it
                        # covers positions st and ed, just return
                        # the res_pos
                        return res_pos
                    elif back_res_pos == st:
                        # this means only two lines left, and
                        # they cover positions st and ed. just
                        # compare and decide which to return
                        if ae(match):
                            return st
                        else:
                            return res_pos
                    else:
                        if ae(match):
                            ed = back_res_pos
                        else:
                            st = back_res_pos
                else:
                    ed = res_pos
            else:
                if res_pos == st:
                    # this means st and ed must be covered
                    # by the same line, just return st/res_pos
                    return st
                st = res_pos
        else:
            # forward search does not find any valid lines
            # try backward search
            match, line, res_pos = backward_match(f, mid, st, regprog)
            if not match:
                # the whole file does not contain any valid line
                return None
            # found one valid line, compare with cmp_pattern
            if ae(match):
                ed = res_pos
            else:
                # this line and the lines follow, until ed,
                # all locate before the target cmp_pattern,
                # thus return ed directly
                return ed
    return None if st > ed else ed
def detect_datetime_format(pattern, param_format = None):
    """detect datetime format of a pattern

    Tries every entry of DATETIME_FORMAT_LIST in order and returns a copy
    of the first matching entry, enriched with a compiled parser and the
    sample pattern itself.
    Authors: xuruiqi
    Returns:
        {
            "regex": ,          # string type of regex format
            "direct_compare": , # whether it's able to
                                # directly compare two time pattern
            "parser": ,         # compiled regex object
            "sample": ,         # the pattern itself
            "is_global_time": , # indicate whether it is UTC time
        }
        or None when no known format matches.
    Fix: "is_global_time" is now always present (the docstring promised
    it, but it used to be set only when the pattern contained a "T",
    leaving callers to hit a missing key otherwise).
    """
    global DATETIME_FORMAT_LIST
    for datetime_format in DATETIME_FORMAT_LIST:
        match = re.search(datetime_format["regex"], pattern)
        if match:
            datetime_format_info = copy.deepcopy(datetime_format)
            datetime_format_info["parser"] = re.compile(
                datetime_format["regex"])
            if datetime_format.get("direct_compare", None) is None:
                datetime_format_info["direct_compare"] = True
            datetime_format_info["sample"] = pattern
            # ISO-8601-style "T" separator is taken as a hint of UTC
            datetime_format_info["is_global_time"] = pattern.find("T") >= 0
            return datetime_format_info
    return None
def detect_file_format(f, start, end, param_format_info):
"""detect whether file is arranged in ascending order or
descending order, and detect the datetime format of file
Authors: xuruiqi
Returns:
{
"regex": , # string type of regex format
"direct_compare": , # whether it's able to
# directly compare two time pattern
"parser": , # compiled regex object
"order": , # the value is 'ascending' or 'descending',
indicating the order of the file
"sample": , # the pattern itself
}
"""
global scan_line_num
original_seek_pos = f.tell()
file_format_info = {
"regex" : None,
"direct_compare": None,
"parser": None,
"order" : None,
"sample": None,
}
# sample the file to determine datetime format in file
try:
MAX_READLINE_NUM = 100
COUNT_THRESHOLD = 3
f.seek(0)
current_format_info = None
current_count = 0
for i in xrange(0, MAX_READLINE_NUM):
scan_line_num += 1
line = f.readline()
if not line:
break
tmp_format_info = detect_datetime_format(line)
if tmp_format_info is None:
continue
if current_format_info is None or \
tmp_format_info["regex"] == current_format_info["regex"]:
current_count += 1
if current_format_info is None:
current_format_info = tmp_format_info
else:
current_count -= 1
if current_count <= 0:
current_count = 1
current_format_info = tmp_format_info
if current_count >= COUNT_THRESHOLD:
break
# can not return None, because some file may have very few lines
#if current_count < COUNT_THRESHOLD:
# return None
except Exception as ex:
raise ex
finally:
f.seek(original_seek_pos)
file_format_info["direct_compare"] = current_format_info["direct_compare"]
file_format_info["regex"] = current_format_info["regex"]
file_format_info["parser"] = current_format_info["parser"]
file_format_info["sample"] = current_format_info["sample"]
# if both start and end are provied, we assume the file order
# is the same as start -> end
if end and dtcmp(start, end, param_format_info, "<"):
file_format_info["order"] = "ascending"
return file_format_info
elif end and dtcmp(start, end, param_format_info, ">"):
file_format_info["order"] = "descending"
return file_format_info
elif end and start == end:
return None
# sample the file to determine file order
try:
MAX_READLINE_NUM = 1000
first_datetime = None
second_datetime = None
# read from head
f.seek(0)
for i in xrange(0, MAX_READLINE_NUM):
scan_line_num += 1
line = f.readline()
if not line:
return None
match = file_format_info["parser"].search(line)
if match:
first_datetime = match.group(0)
break
if not first_datetime:
return None
# read from tail
first_pos = f.tell()
f.seek(0, os.SEEK_END)
last_second_pos = f.tell()
for i in xrange(0, MAX_READLINE_NUM):
if last_second_pos <= first_pos:
return None
# locate the begining of the lines reversly
last_second_pos = locate_current_line(
f,
last_second_pos - 1,
first_pos - 1)
f.seek(last_second_pos)
scan_line_num += 1
line = f.readline()
# no need to judge if reaches eof, which is impossible
match = file_format_info["parser"].search(line)
if match:
second_datetime = match.group(0)
break
if not second_datetime:
return None
# compare first_datetime and second_datetime
if dtcmp(first_datetime, second_datetime, file_format_info, "<"):
file_format_info["order"] = "ascending"
return file_format_info
elif dtcmp(first_datetime, second_datetime, file_format_info, ">"):
file_format_info["order"] = "descending"
return file_format_info
else:
return None
except Exception as ex:
raise ex
finally: