-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalculateDistributions.R
125 lines (92 loc) · 5.09 KB
/
calculateDistributions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Script which calculates distributions from the govdocs dataset
library(ggplot2)
source('utils.R')
# Input: tab-separated metadata table with one row per file.
tsvPath <- "/Users/kresimir/Dropbox/Work/Projects/BenchmarkDP/publications/INFSOF/experiments/real world dataset/metadata.tsv"
# Output folder for the plots and distribution files. The trailing slash
# matters because file names are appended by plain string concatenation.
distFolder <- "/Users/kresimir/Dropbox/Work/Projects/BenchmarkDP/publications/INFSOF/experiments/real world dataset/distributions/"
samplesPath <- paste0(distFolder, "samples.tsv")

fileMetadata <- read.table(
  tsvPath,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Coerce every count column to numeric. as.numeric() turns entries that
# cannot be parsed as numbers into NA (with a warning).
countColumns <- c(
  "numPage", "numParag", "numTables",
  "numWords", "numWordTable", "numParagTable"
)
fileMetadata[countColumns] <- lapply(fileMetadata[countColumns], as.numeric)
# Scatter plot of paragraph count (y) against page count (x) for every file.
# The axes are limited to 0-100 pages and 0-5000 paragraphs; with ggplot2's
# default out-of-bounds handling, points outside these limits are not drawn.
paragPageScatter <- ggplot(fileMetadata, aes(x = numPage, y = numParag)) +
  geom_point(alpha = 0.3, color = "#3182bd") +
  scale_x_continuous(limits = c(0, 100)) +
  scale_y_continuous(limits = c(0, 5000)) +
  theme_bw()

# Render the plot to a 640x480 PNG in the distributions folder.
path <- paste0(distFolder, "paragPageScatter.png")
# `height` is spelled out in full rather than relying on partial argument
# matching of an abbreviated name.
png(path, width = 640, height = 480)
print(paragPageScatter)
dev.off()
# Scatter plot of paragraph count (y) against word count (x) for every file.
# The axes are limited to 0-30000 words and 0-5000 paragraphs; with ggplot2's
# default out-of-bounds handling, points outside these limits are not drawn.
# Disabled alternative that bins the points in 2D instead of drawing them:
#wordParagScatter <- ggplot(fileMetadata, aes(x=numWords, y=numParag)) + geom_bin2d() +
wordParagScatter <- ggplot(fileMetadata, aes(x = numWords, y = numParag)) +
  geom_point(alpha = 0.3, color = "#3182bd") +
  scale_x_continuous(limits = c(0, 30000)) +
  scale_y_continuous(limits = c(0, 5000)) +
  theme_bw()

# Render the plot to a 640x480 PNG in the distributions folder.
path <- paste0(distFolder, "wordParagScatter.png")
# `height` is spelled out in full rather than relying on partial argument
# matching of an abbreviated name.
png(path, width = 640, height = 480)
print(wordParagScatter)
dev.off()
# Per-feature value distributions ----

# Computes the value frequencies of one metadata column, saves them, and tags
# the result with the column it describes.
#   binPath  - bin file path, passed as the first argument of valueFrequencies()
#   column   - name of the fileMetadata column to process; also stored in the
#              `feature` column of the result
#   distName - output name handed to saveBins()
# Returns the frame produced by valueFrequencies() with `feature` added.
# saveBins() is called before `feature` is added, so it receives the frame
# without that column.
buildBinFrame <- function(binPath, column, distName) {
  binFrame <- valueFrequencies(binPath, fileMetadata, column)
  saveBins(distFolder, distName, binFrame)
  binFrame$feature <- column
  binFrame
}

# page distributions
binFrameNPag <- buildBinFrame("input/pageBins.txt", "numPage", "pageDist")
# word distributions
binFrameNWo <- buildBinFrame("input/wordBins.txt", "numWords", "wordDist")
# paragraph distributions
binFrameNPar <- buildBinFrame("input/paragraphBins.txt", "numParag", "paragDist")
# table distributions
binFrameNTab <- buildBinFrame("input/tableBins.txt", "numTables", "tableDist")
# words in table distributions
binFrameNWTab <- buildBinFrame(
  "input/tableWordBins.txt", "numWordTable", "wordTableDist"
)
# paragraphs in table distributions
binFrameNPTab <- buildBinFrame(
  "input/tableParagraphBins.txt", "numParagTable", "paragTableDist"
)

# Stack all per-feature bins into one frame; the `feature` column tells the
# rows of the different features apart.
allBinsFrame <- rbind(
  binFrameNPag, binFrameNPar, binFrameNTab,
  binFrameNWo, binFrameNWTab, binFrameNPTab
)
# Representative samples ----
# Build every combination of bin titles across the active features and ask
# calculateSamples() for a sample for each combination.

# Disabled variant using all six features:
#combinationFrame <- expand.grid(numParag=binFrameNPar$title, numPage=binFrameNPag$title,
# numWords=binFrameNWo$title, numTables=binFrameNTab$title,
# numWordTable=binFrameNWTab$title, numParagTable=binFrameNPTab$title)
# Disabled variant using only paragraphs and pages:
#combinationFrame <- expand.grid(numParag=binFrameNPar$title, numPage=binFrameNPag$title)
combinationFrame <- expand.grid(
  numParag = binFrameNPar$title,
  numPage = binFrameNPag$title,
  numWords = binFrameNWo$title,
  numTables = binFrameNTab$title
)

# Features passed to calculateSamples(); keep in step with combinationFrame.
#numProp <- c("numPage", "numWords", "numParag", "numTables", "numWordTable", "numParagTable")
#numProp <- c("numPage", "numParag")
numProp <- c("numPage", "numWords", "numParag", "numTables")
# Count columns kept when the selected samples are written out.
numPropToSave <- c(
  "numPage", "numWords", "numParag",
  "numTables", "numWordTable", "numParagTable"
)

# apply() hands each row of combinationFrame to the callback as one vector.
sampleForCombination <- function(binCombination) {
  calculateSamples(
    binCombination, allBinsFrame, fileMetadata, numProp, samplesPath
  )
}
samples <- apply(combinationFrame, 1, sampleForCombination)
#sampleFrame <- rbind(samples)

# Drop NA entries (presumably combinations without a matching file - TODO
# confirm against calculateSamples()), then keep the metadata rows whose file
# name is among the remaining samples.
hasSample <- !is.na(samples)
samples <- samples[hasSample]
isSampled <- fileMetadata$fileName %in% samples
samplesMetadata <- fileMetadata[isSampled, ]
# Overlay the selected samples (size 3, shape 18) on the paragraph/page
# scatter plot and give the axes readable labels.
samplesScatter <- paragPageScatter +
  geom_point(
    data = samplesMetadata,
    aes(x = numPage, y = numParag),
    size = 3,
    shape = 18
  ) +
  labs(x = "number of pages", y = "number of paragraphs")

# Render the plot to a 640x480 PNG in the distributions folder.
path <- paste0(distFolder, "paragPageScatterSamples.png")
# `height` is spelled out in full rather than relying on partial argument
# matching of an abbreviated name.
png(path, width = 640, height = 480)
print(samplesScatter)
dev.off()
# Overlay the selected samples (size 3, shape 18) on the paragraph/word
# scatter plot and give the axes readable labels.
samplesScatter2 <- wordParagScatter +
  geom_point(
    data = samplesMetadata,
    aes(x = numWords, y = numParag),
    size = 3,
    shape = 18
  ) +
  labs(x = "number of words", y = "number of paragraphs")

# Render the plot to a 640x480 PNG in the distributions folder.
path <- paste0(distFolder, "wordParagScatterSamples.png")
# `height` is spelled out in full rather than relying on partial argument
# matching of an abbreviated name.
png(path, width = 640, height = 480)
print(samplesScatter2)
dev.off()
# Export the selected samples ----
# Keep only the file name and the count columns listed in numPropToSave.
# drop = FALSE keeps a data frame even when a single column matches.
exportColumns <- colnames(samplesMetadata) %in% c(numPropToSave, "fileName")
samplesMetadata <- samplesMetadata[, exportColumns, drop = FALSE]

# Prefix file names with "sample_" and values with "<column>:".
# paste0() returns a length-one string for zero-length input, which cannot be
# assigned into a zero-row data frame, so the relabelling is skipped when no
# samples were selected; an empty file is written in that case.
if (nrow(samplesMetadata) > 0) {
  samplesMetadata$fileName <- paste0("sample_", samplesMetadata$fileName)
  for (nam in numPropToSave) {
    samplesMetadata[[nam]] <- paste0(nam, ":", samplesMetadata[[nam]])
  }
}

# Tab-separated, without header, row names, or quoting.
write.table(
  samplesMetadata,
  samplesPath,
  row.names = FALSE,
  col.names = FALSE,
  sep = "\t",
  quote = FALSE
)