-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessMetadata.R
71 lines (43 loc) · 2.48 KB
/
processMetadata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
library(ggplot2)
source('utils.R')
# ---- Locate the experiment's documents and ground-truth metadata ----
# `basePath` is not defined in this script; presumably it is provided by
# utils.R (sourced above) -- TODO confirm before running stand-alone.
experiment <- "Experiment_345"
pathDocuments <- paste0(basePath, "Generated/", experiment, "/Documents/")
pathMetadata <- paste0(basePath, "Generated/", experiment, "/GroundTruth/Metadata/")
listFiles <- list.files(pathDocuments)

# Zero-row placeholder: this is what dfHolder stays as when no documents are
# found. rbind() drops zero-row data frames, so these column names do not
# constrain the combined result.
# NOTE(review): the placeholder declares `docName`, while the code further down
# reads `testCaseName` -- confirm which columns readTestMetadata() returns.
dfHolder <- data.frame(docName=character(), name=character(), source=character(), value=character())

# Read the metadata XML that corresponds to each generated document.
# The per-file tables are collected in a list and bound once at the end,
# instead of growing dfHolder with rbind() on every iteration.
perFileMetadata <- lapply(listFiles, function(f) {
  # Base name = the part of the file name before the FIRST dot.
  baseName <- unlist(strsplit(f, ".", fixed = TRUE))[1]
  fileMetadata <- paste0(pathMetadata, baseName, ".xml")
  print(fileMetadata)  # progress trace: which metadata file is being read
  readTestMetadata(fileMetadata)
})
dfHolder <- do.call(rbind, c(list(dfHolder), perFileMetadata))
# ---- Reshape the long metadata table into one row per test case ----
# A measure is identified by "<name>.<source>"; each distinct measure becomes
# one column of testMetadata, keyed by testCaseName.
dfHolder$measureName <- paste0(dfHolder$name, ".", dfHolder$source)
testMetadata <- data.frame(testCaseName = unique(dfHolder$testCaseName))
uniqMeasures <- unique(dfHolder$measureName)
for (meas in uniqMeasures) {
  # Rows of this measure only, reduced to the join key and the measured value.
  dfTmp <- dfHolder[dfHolder$measureName == meas, c("testCaseName", "value")]
  # Name the value column after the measure BEFORE merging. Otherwise every
  # merge adds another column called "value", merge() has to invent .x/.y
  # suffixes, and it warns about duplicated column names from the fourth
  # measure onwards.
  names(dfTmp)[names(dfTmp) == "value"] <- meas
  # Left join: keep every test case, NA where it has no value for this measure.
  testMetadata <- merge(x = testMetadata, y = dfTmp, by = "testCaseName", all.x = TRUE)
}
# merge() puts the key first and appends one column per iteration, so the
# columns are already in this order; the assignment pins the final names.
colnames(testMetadata) <- c("testCaseName", uniqMeasures)
# ---- Plot: distribution of formats ----
# Bar chart with one bar per distinct value of externalIdentifier.Fits and the
# number of test cases carrying that value as bar height.
#testMetadata$externalIdentifier.Fits <- as.character(testMetadata$externalIdentifier.Fits)
# The column is referenced by bare name: aes() looks it up in the `data`
# argument, so `testMetadata$...` inside aes() is unnecessary and discouraged
# by ggplot2 (it bypasses `data` and leaks the expression into the axis label).
formatHistPlot <- ggplot(testMetadata, aes(x = externalIdentifier.Fits)) +
  geom_bar(stat = "count", color = "#3182bd", fill = "#3182bd") +
  theme_bw()
formatHistPlot
# ---- Numeric coercion and file-size histogram ----
# Convert the page and paragraph counts to numbers. Going through
# as.character() first yields the printed values even when a column is a
# factor (as.numeric() on a factor alone returns the internal level codes).
testMetadata$number_of_pages.DataGenerator <- as.numeric(as.character(testMetadata$number_of_pages.DataGenerator))
testMetadata$number_of_paragraphs.DataGenerator <- as.numeric(as.character(testMetadata$number_of_paragraphs.DataGenerator))

# Histogram of size.Fits. The same character round-trip is applied here so a
# factor column is plotted by its values rather than by its level codes,
# consistent with the two conversions above.
sizePlot <- ggplot(testMetadata, aes(x = as.numeric(as.character(size.Fits)))) +
  geom_histogram()
sizePlot
# ---- Histograms of page and paragraph counts ----
# geom_histogram() already maps y to the per-bin count by default, so no
# explicit y aesthetic is given; this also avoids the `..count..` notation
# that ggplot2 deprecated in version 3.4.0.
#pageCountHist <- ggplot(testMetadata, aes(x=pagecount.GenerationProcess)) + geom_bar()
pageCountHist <- ggplot(testMetadata, aes(x = number_of_pages.DataGenerator)) +
  geom_histogram(colour = "black", fill = "grey") +
  labs(x = "Page Count", y = "Count")
pageCountHist

paragraphCountHist <- ggplot(testMetadata, aes(x = number_of_paragraphs.DataGenerator)) +
  geom_histogram(colour = "black", fill = "grey") +
  labs(x = "Paragraph Count", y = "Count")
paragraphCountHist
# ---- Scatter plot: paragraph count against page count ----
# One point per row of testMetadata; black-and-white theme with axis text and
# axis titles enlarged to size 20.
pageparagScater <- ggplot(
  data = testMetadata,
  mapping = aes(
    x = number_of_pages.DataGenerator,
    y = number_of_paragraphs.DataGenerator
  )
) +
  geom_point() +
  labs(x = "Page Count", y = "Paragraph Count") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 20)
  )
pageparagScater