-
Notifications
You must be signed in to change notification settings - Fork 1
/
run-all-language-saturation
executable file
·65 lines (54 loc) · 1.5 KB
/
run-all-language-saturation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env bash
#
# Run the record-level measurement of language features.
#
# Input
# The JSON files are read from the /europeana directory on HDFS.
# Output
# You must specify an output file name, such as resultXX-language.csv.
# This file will be the input of the top-level and collection-level aggregations.
# Local file that receives the merged result; first and only argument.
OUTPUT=$1
# JAR_VERSION=0.7-SNAPSHOT
# URI prefix put in front of every HDFS path handed to the Spark job.
HDFS=hdfs://localhost:54310
# Local file name of the retrieved header.
HEADER=header-language-saturation.csv
# HDFS directory the header is merged from.
HEADEROUTPUT=/join/$HEADER
# Local checksum file that is removed again after the header is retrieved.
HEADERCRC=.$HEADER.crc

# Exactly one non-empty argument is required.
if [[ "$#" -ne 1 || -z "$OUTPUT" ]]; then
  echo "You should add an output file (such as resultXX-language.csv)!" >&2
  exit 1
fi

# Refuse to overwrite an existing file. The expansion is quoted so that a name
# containing spaces or glob characters cannot break the test and let the
# script continue by accident.
if [[ -e "$OUTPUT" ]]; then
  echo "WARNING! $OUTPUT exists, rename it to prevent overwriting the file!" >&2
  exit 1
fi
# Delete an HDFS directory if it exists, then wait a moment.
# Arguments: $1 - HDFS directory
#            $2 - number of seconds to sleep after a deletion
purge_hdfs_dir() {
  local target=$1
  local pause=$2

  # Nothing to delete, nothing to wait for.
  hdfs dfs -test -d "$target" || return 0

  hdfs dfs -rm -r -f -skipTrash "$target"
  sleep "$pause"
}

echo "Starting language detection."

# Clear both directories that are later handed to the Spark job as targets.
purge_hdfs_dir /result 5
purge_hdfs_dir "$HEADEROUTPUT" 2
# Load the application jar settings. The path is relative to the current
# working directory. The sourced script is not visible here; it is expected
# to define $JAR (see the commented-out assignment below) - TODO confirm.
source ../../set-application-jar.sh
# JAR=target/europeana-qa-spark-${JAR_VERSION}-jar-with-dependencies.jar

# Without a jar every following argument would shift by one position and
# spark-submit would treat the input pattern as the application.
if [[ -z "${JAR:-}" ]]; then
  echo "JAR is not set, check ../../set-application-jar.sh!" >&2
  exit 1
fi

# The master URL and the *.json pattern are quoted so the local shell passes
# them on literally instead of trying pathname expansion on them.
# Positional arguments after the jar: input pattern, /result directory,
# header directory, and two file names whose role is defined by the
# LanguageSaturation class (not visible here).
if ! spark-submit --class de.gwdg.europeanaqa.spark.LanguageSaturation \
  --master 'local[*]' \
  "$JAR" \
  "$HDFS/europeana/*.json" \
  "$HDFS/result" \
  "$HDFS$HEADEROUTPUT" \
  data-providers.txt \
  datasets.txt; then
  # Stop here: there is no point in retrieving results of a failed job, and
  # the script must not report success.
  echo "Language detection failed: spark-submit returned an error!" >&2
  exit 1
fi
# Print the path of the hidden checksum file that belongs to a local file:
# ".<name>.crc" in the same directory as the file itself.
# Arguments: $1 - path of the local file
# Outputs:   the checksum file path on stdout
crc_sidecar_path() {
  local path=$1
  local name=${path##*/}
  local dir=

  # Keep the directory part (if any) in front of the dot-prefixed file name.
  if [[ "$path" == */* ]]; then
    dir=${path%/*}/
  fi

  printf '%s\n' "${dir}.${name}.crc"
}

# Set to 1 as soon as one of the downloads fails.
retrieval_failed=0

echo "Retrieve $OUTPUT"
if ! hdfs dfs -getmerge /result "$OUTPUT"; then
  echo "Could not retrieve $OUTPUT from HDFS directory /result!" >&2
  retrieval_failed=1
fi
# -f: a missing checksum file is not an error.
rm -f -- "$(crc_sidecar_path "$OUTPUT")"

echo "Retrieve $HEADER"
if hdfs dfs -getmerge "$HEADEROUTPUT" "$HEADER"; then
  # Pause kept from the original script; only needed after a real download.
  sleep 2
else
  echo "Could not retrieve $HEADER from HDFS directory $HEADEROUTPUT!" >&2
  retrieval_failed=1
fi
rm -f -- "$HEADERCRC"

# The header directory on HDFS is no longer needed once it has been fetched.
if hdfs dfs -test -d "$HEADEROUTPUT"; then
  hdfs dfs -rm -r -f -skipTrash "$HEADEROUTPUT"
  sleep 2
fi

if (( retrieval_failed == 0 )); then
  echo "Language detection is done!"
else
  echo "Language detection finished, but retrieving the results failed!" >&2
  # Keep this as the last command of the script: its status becomes the
  # script's exit status, so callers can detect the failure.
  false
fi