forked from DEIB-GECO/Metadata-Manager
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGMQLLoader.scala
124 lines (113 loc) · 5.01 KB
/
GMQLLoader.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
package it.polimi.genomics.metadata.step

import java.io.File
import java.util

import scala.util.control.NonFatal

import it.polimi.genomics.core.DataStructures.IRDataSet
import it.polimi.genomics.core.GDMSUserClass
import it.polimi.genomics.manager.ProfilerLauncher
import it.polimi.genomics.metadata.database.{FileDatabase, Stage}
import it.polimi.genomics.metadata.step.utils.{DatasetNameUtil, DirectoryNamingUtil}
import it.polimi.genomics.metadata.step.xml.Source
import it.polimi.genomics.repository.{GMQLRepository, GMQLSample, Utilities}
import org.slf4j.LoggerFactory
/**
* Created by Nacho on 10/17/16.
*/
class GMQLLoader {
  val logger = LoggerFactory.getLogger(this.getClass)
  // Repository access: Utilities() reads the GMQL configuration; getRepository()
  // returns the configured backend implementation.
  val ut: Utilities = Utilities()
  val repo: GMQLRepository = ut.getRepository()

  /**
    * Using information in the source configuration, inserts into the GMQLRepository
    * the files already downloaded, transformed and organized.
    *
    * Files have to be in the folder: source.outputFolder/dataset.outputFolder/Transformations/
    * sorted as pairs (file, file.meta); the .schema file must be in the same folder.
    *
    * The process looks for every ".meta" file and pairs it with its data file
    * (same name minus the ".meta" suffix). The GMQLTransformer should put inside
    * the folder just the necessary files.
    *
    * @param source contains files location and datasets organization
    */
  def loadIntoGMQL(source: Source): Unit = {
    // Fails fast with NoSuchElementException if "gmql_user" is not configured —
    // intentional: loading without a user makes no sense.
    val gmqlUser = source.parameters.filter(_._1 == "gmql_user").head._2
    repo.registerUser(gmqlUser)
    val stage = Stage.TRANSFORM
    logger.info("Preparing for loading datasets into GMQL")
    source.datasets.foreach { dataset =>
      logger.debug("dataset " + dataset.name)
      if (dataset.loadEnabled) {
        val path = source.outputFolder + File.separator + dataset.outputFolder + File.separator + DirectoryNamingUtil.flattenFolderName
        val listAdd = new java.util.ArrayList[GMQLSample]()
        val datasetId = FileDatabase.datasetId(FileDatabase.sourceId(source.name), dataset.name)
        // Collect every (data, meta) pair registered for this dataset at the TRANSFORM stage.
        FileDatabase.getFilesToProcess(datasetId, stage).filter(_._2.endsWith(".meta")).foreach { file =>
          // file._3 is the copy number: copies beyond the first get "_<copy>"
          // spliced in before the first dot of the file name.
          val fileName = if (file._3 == 1) file._2 else file._2.replaceFirst("\\.", "_" + file._3 + ".")
          try {
            listAdd.add(GMQLSample(
              path + File.separator + fileName.substring(0, fileName.lastIndexOf(".meta")),
              path + File.separator + fileName,
              null))
          } catch {
            // NonFatal (was Throwable): let OOM / fatal VM errors propagate
            // instead of being logged as a missing file.
            case NonFatal(e) => logger.warn("data or metadata files missing: " + path + File.separator + fileName + ". more details: " + e.getMessage)
          }
        }
        if (listAdd.size() > 0) {
          logger.info("Trying to add " + dataset.name + " to user: " + gmqlUser)
          val datasetName = DatasetNameUtil.loadDatasetName(dataset)
          // If the dataset already exists it is skipped (the delete-then-add
          // behavior is intentionally disabled).
          if (dsExists(gmqlUser, datasetName))
            try {
              // repo.deleteDS(datasetName, gmqlUser)
              logger.info("The dataset exists, skipped: " + datasetName)
            } catch {
              // Should be GMQLDSNotFound but its location is not known yet.
              case e: Exception => logger.info("Dataset " + datasetName + " is not defined before!!")
            }
          else
            try {
              repo.importDs(
                datasetName,
                gmqlUser,
                GDMSUserClass.PUBLIC,
                listAdd,
                path + File.separator + dataset.name + ".schema")
              logger.info("import for dataset " + dataset.name + " completed")
              ProfilerLauncher.profileDS(gmqlUser, datasetName)
              logger.info("profiler for dataset " + dataset.name + " completed")
              // Optional human-readable description configured per dataset.
              // NOTE: the leading space in the metadata keys looks intentional
              // (likely forces sort order in the UI) — preserved as-is.
              dataset.parameters.find(_._1 == "loading_description").map(_._2).foreach { desc =>
                repo.setDatasetMeta(datasetName, gmqlUser, Map(" Description" -> desc))
              }
              repo.setDatasetMeta(datasetName, gmqlUser, Map(" Download date" -> FileDatabase.getLastDownloadDate(datasetId)))
            }
            catch {
              // NonFatal (was Throwable): a failed import is logged and the
              // loop continues with the next dataset; fatal errors propagate.
              case NonFatal(e) => logger.error("import failed: ", e)
            }
        }
        else
          logger.info("dataset " + dataset.name + " has no files to be loaded")
      }
      else
        logger.debug("dataset " + dataset.name + " not included to load.")
    }
  }

  /**
    * By checking in the GMQLRepository indicates if the dataset exists.
    *
    * @param username    user for the datasets to be added
    * @param datasetName name of the dataset to check
    * @return whether the dataset exists for the username given
    */
  def dsExists(username: String, datasetName: String): Boolean = {
    val dss: util.List[IRDataSet] = repo.listAllDSs(username)
    // IRDataSet.position holds the dataset name in this repository API.
    (0 until dss.size()).exists(i => dss.get(i).position == datasetName)
  }
}