-
Notifications
You must be signed in to change notification settings - Fork 2
/
README.R
71 lines (59 loc) · 2.52 KB
/
README.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
suppressMessages(library(tidyverse))
# for reprex
setwd("C:/Users/dreznik/Dropbox/Data Science/R Projects/read_lines_bug")
# Confirm UTF-8 encoding of included zip file (it contains a larger .txt)
fname <- "test_file.zip"
guess_encoding(fname)
#Read all lines in zip (approx. 31.6k)
all_lines <- read_lines(fname)
length(all_lines)
# Notice all lines have "well-formed" terminations with a carriage-return followed by a newline.
{
s <- read_file(fname)
print(s %>% str_count("\\r"))
print(s %>% str_count("\\n"))
print(s %>% str_count("\\r\\n"))
print(s %>% str_count("\\n\\r"))
}
# For this file, the separator is ";" (irrelevant for read_lines()), and there should be 45 per line. However, notice a few lines have a non-standard number of separators (';'), namely: 46 and 48.
all_lines %>%
str_count(";") %>%
table
# List which lines have the wrong number of separators, as this will drive us to the bug.
all_lines %>%
str_count(";") %>%
{which(.!=45)}
# Use `read_lines()` to read only the first lines with the wrong number of separators, shown above to be line # 14579. So "skip" that number of lines minus one.
line1 <- read_lines(fname,
skip=14578,
n_max=1)
# Notice its content not only has a different number of separators, it's a different line altogether! (herein lies the bug)
all_lines[14579] %>% str_count(";")
line1 %>% str_count(";")
all_lines[14579] == line1
# Notice the line retrieved by `read_lines(skip=14578,n_max=1)` above actually appears much later in the file, suggesting the reading under this skip+n_max mode lost "sync".
(all_lines==line1)%>%which
# Notes:
# * this bug is not caused by the .zip (same behavior if starts from an uncompressed file)
# * this is not related to encoding, in fact, I made sure the file inside the zip is UTF-8
# * this issue seems to manifest itself with longer files only. potentially there's a character within this file being interpreted as a carriage return.
# binary search
find_weird_row <- function(n0,n_min,n_max) {
l_skip <- read_lines(fname,skip=n0,n_max=1)
l_all <- all_lines[n0+1]
l_eq <- l_skip==l_all
print(n0)
#browser()
if(n0%in%c(n_min,n_max)) {
print(sprintf("line %d with l_eq=%s",
n0,
l_eq%>%as.character()))
n0
} else {
if (!l_eq)
find_weird_row(as.integer((n0+n_min)/2),n_min,n0)
else
find_weird_row(as.integer((n0+n_max)/2),n0,n_max)
}
}
find_weird_row(15000,1,length(all_lines))