-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis_oxford.sh
executable file
·108 lines (71 loc) · 3.13 KB
/
analysis_oxford.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/bash
# Validate the command line: exactly two positional arguments are required.
#   $1 - path that is searched recursively (grep -r) for metadata
#   $2 - directory that receives the generated report files
if [ "$#" -ne 2 ]; then
  # A usage error is a diagnostic, so it belongs on stderr, not stdout.
  echo "Usage: $0 METADATA_DIRECTORY OUTPUT_DIRECTORY" >&2
  exit 1
fi
METADATA_DIRECTORY=$1
OUTPUT_DIRECTORY=$2
# Fail early with one clear message instead of one grep error per report.
if [ ! -e "$METADATA_DIRECTORY" ]; then
  echo "$0: METADATA_DIRECTORY '$METADATA_DIRECTORY' does not exist" >&2
  exit 1
fi
# Reports are written with plain redirections, which fail when the target
# directory is missing; create it up front (no-op if it already exists).
mkdir -p -- "$OUTPUT_DIRECTORY" || exit 1
# Rewrite a `uniq -c` style listing in place as pipe-delimited table rows.
# Arguments:
#   $1 - path of the file to rewrite (modified in place with sed -i)
# For every line, in this order:
#   1. the whitespace character that follows the leading count becomes "|"
#   2. one leading whitespace character becomes "|"
#   3. a "|" is appended at the end of the line
# e.g. "      3 foo bar" -> "|     3|foo bar|"
# Relies on GNU sed (-r, -i and the \s class).
markdownify() {
  # local: do not leak the path into the caller's scope.
  local file_to_markdownify="$1"
  # One sed invocation applies the three expressions to each line in order;
  # the path is quoted so names containing spaces or globs still work.
  sed -r -i \
    -e 's/^(\s*[0-9]+)\s/\1|/' \
    -e 's/^\s/|/' \
    -e 's/$/|/' \
    -- "$file_to_markdownify"
}
# Each section below greps $METADATA_DIRECTORY recursively for one kind of
# value (-o: print only the match, -h: no file names, -r: recurse), tallies
# identical values with `uniq -c`, writes the tally to a file in
# $OUTPUT_DIRECTORY and turns it into pipe-delimited rows with markdownify.
# All path expansions are quoted so directories containing spaces work.

#JOURNALS
# Journal titles, counted case-insensitively, ordered by ascending count.
# `sort -f` folds case so that titles differing only by case are adjacent,
# which `uniq -i` needs in order to merge them.
FILENAME="$OUTPUT_DIRECTORY/journals.txt"
grep -ohr '<journal-title>.*<\/journal-title>' -- "$METADATA_DIRECTORY" \
  | sed -e 's/<[\/]\{0,1\}journal-title>//g' \
  | sort -f \
  | uniq -ci \
  | sort -n > "$FILENAME"
markdownify "$FILENAME"

#ISSN-P
# Print ISSNs (pub-type="ppub"), ordered by ascending count.
FILENAME="$OUTPUT_DIRECTORY/pissn.txt"
grep -ohr '<issn pub-type\=\"ppub\">[^>]*<\/issn>' -- "$METADATA_DIRECTORY" \
  | sed -e 's/<[\/]\{0,1\}issn[^>]*>//g' \
  | sort \
  | uniq -c \
  | sort -n > "$FILENAME"
markdownify "$FILENAME"

#DOCTYPES
# DOCTYPE identifiers (PUBLIC/SYSTEM ... dtd">), ordered by ascending count.
# NOTE: grep matches one line at a time, so a DOCTYPE declaration that is
# split over several lines (reported for Cambridge files) is not captured.
FILENAME="$OUTPUT_DIRECTORY/doctypes.txt"
grep -ohrE '(PUBLIC|SYSTEM|public|system)[^>]*dtd\">' -- "$METADATA_DIRECTORY" \
  | sort \
  | uniq -c \
  | sort -n > "$FILENAME"
markdownify "$FILENAME"

#NAMESPACES
# Prefixed XML namespace declarations (xmlns:prefix="uri"), by ascending count.
FILENAME="$OUTPUT_DIRECTORY/namespaces.txt"
grep -ohr 'xmlns:\S*\="\S*"' -- "$METADATA_DIRECTORY" \
  | sort \
  | uniq -c \
  | sort -n > "$FILENAME"
markdownify "$FILENAME"

#COPYRIGHT YEARS
# Copyright years; left in value order (no final sort by count).
FILENAME="$OUTPUT_DIRECTORY/copyright-years.txt"
grep -ohr "<copyright-year>.*<\/copyright-year>" -- "$METADATA_DIRECTORY" \
  | sed -e 's/<[\/]\{0,1\}copyright-year>//g' \
  | sort \
  | uniq -c > "$FILENAME"
markdownify "$FILENAME"

#ENCODINGS
# Declared encodings, counted case-insensitively (hence `sort -f` before
# `uniq -ci`); left in value order.
FILENAME="$OUTPUT_DIRECTORY/encodings.txt"
grep -ohr 'encoding\="[^"]*' -- "$METADATA_DIRECTORY" \
  | sed -e 's/encoding\="//g' \
  | sort -f \
  | uniq -ci > "$FILENAME"
markdownify "$FILENAME"
#FIELDS
# Writes one "LABEL |COUNT" row per metadata field to fields.txt.
FIELDS="$OUTPUT_DIRECTORY/fields.txt"

# Print one "LABEL |COUNT" row on stdout.
# Arguments:
#   $1 - row label, printed verbatim and followed by " |"
#   $2 - grep basic regular expression to count
# Globals:
#   METADATA_DIRECTORY (read) - searched recursively
# `grep -o` prints every match on its own line, so `wc -l` counts
# occurrences rather than matching lines.
count_field() {
  printf '%s |' "$1"
  grep -ohr -e "$2" -- "$METADATA_DIRECTORY" | wc -l
}

# A single redirection truncates the report once and collects every row.
{
  count_field "nombre d'articles" '<\/article>'
  count_field "doi" '<article-id pub-id-type\=\"doi\">'
  count_field "nombre de champs journal-title" '<\/journal-title>'
  count_field 'nombre de champs abbrev-journal-title abbrev-type=full' '<abbrev-journal-title abbrev-type\=\"full\">'
  count_field 'nombre de article titles' '<\/article-title>'
  count_field "abstract" '<abstract>'
  count_field "eissn" '<issn pub-type\=\"epub\">'
  count_field "pissn" '<issn pub-type\=\"ppub\">'
  count_field "publisher" '<\/publisher-name>'
  count_field "keywords groups" '<\/kwd-group>'
  count_field "contrib groups" '<\/contrib-group>'
  count_field "auteurs" '<contrib contrib-type\=\"author\"'
  count_field "nombre d'affiliations" '<\/aff>'
} > "$FIELDS"