-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgulpease.sh
137 lines (100 loc) · 3.44 KB
/
gulpease.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/bash
#
# Computes the Gulpease index of a PDF document.
#
# Usage: gulpease.sh <document.pdf>
#
# The document is converted to plain text with pdftotext; the text file is
# stored in the "pdftotext-output" directory (created in the current directory).
# ./common.config is sourced from the current directory.

##### Consts
# directory receiving the auxiliary "optimized" plain text files
PDFTOTEXT=pdftotext-output
# MINIMUM ASSUME-VALID length: lines longer than this value are NOT checked
# further and are added to the validLines array
MINIMUM_ASSUME_VALID_LENGTH=25
# backup of the IFS in effect when the script starts
OIFS=$IFS

source ./common.config

# Stop early when no document was given: every later step depends on "$1".
if [[ $# -lt 1 || -z "$1" ]]; then
  echo "Usage: $0 <document.pdf>" >&2
  exit 1
fi

# getFileNameWithoutExtension is not defined in this file; it is expected to be
# provided by common.config. Without it "$file" would point at the output
# directory itself.
if ! command -v getFileNameWithoutExtension >/dev/null 2>&1; then
  echo "getFileNameWithoutExtension is not available (is ./common.config present?)" >&2
  exit 1
fi

echo Examining document named "$1"
echo "Create 'pdftotext-output' directory if not existing"
mkdir -p "$PDFTOTEXT"

# prepare file name and path of the plain text version of the document
file=./"$PDFTOTEXT"/$(getFileNameWithoutExtension "$1")

echo Converting to plain text
# Abort when the conversion fails, otherwise a missing (or stale) text file
# would be analysed.
if ! pdftotext "$1" "$file"; then
  echo "pdftotext could not convert '$1'" >&2
  exit 1
fi
#######################################
# Decide whether one line of the extracted text is kept for the analysis.
# Globals:   MINIMUM_ASSUME_VALID_LENGTH (read)
# Arguments: $1 - the line to check
# Returns:   0 when the line is kept, 1 when it has to be skipped
#######################################
isValidLine() {
  local candidate=$1
  local candidateLength=${#candidate}

  ### Common patterns easy enough to detect and omit from output
  # lines shorter than 3 chars
  if [[ "$candidateLength" -lt "3" ]]; then
    return 1
  fi
  # lines ending with multiple dots (possibly separated by a space)
  if [[ "$candidate" =~ \.(\.|[[:space:]])+$ ]]; then
    return 1
  fi
  # lines containing only numbers separated by '.' or '/' or '-'
  if [[ "$candidate" =~ ^([[:digit:]]+(\.|-|/)*)+$ ]]; then
    return 1
  fi

  # A line above the minimum length is accepted without further checks.
  if [[ "$candidateLength" -gt "$MINIMUM_ASSUME_VALID_LENGTH" ]]; then
    return 0
  fi
  # A shorter line might be a title: keep it only when its last character is a
  # sentence-break separator.
  [[ "${candidate: -1}" == [?\.\!] ]]
}

#######################################
# Delete the empty lines of a text file (in place; sed keeps the previous
# content in "<file>.bak") and collect the lines accepted by isValidLine.
# Globals:   validLines (written) - array of the accepted lines, in file order
# Arguments: $1 - path of the plain text file
# Returns:   1 when the file does not exist
#######################################
collectValidLines() {
  local textFile=$1
  local textLine

  # array containing only "valid" lines
  validLines=()

  if [[ ! -f "$textFile" ]]; then
    echo "Text file '$textFile' not found" >&2
    return 1
  fi

  sed -i.bak '/^$/d' "$textFile"

  # Read line by line instead of word-splitting the whole text: an unquoted
  # expansion would also undergo pathname expansion, replacing a line such as
  # "*****" with the names of the files in the current directory. IFS is left
  # untouched, so there is nothing to restore afterwards.
  while IFS= read -r textLine || [[ -n "$textLine" ]]; do
    if isValidLine "$textLine"; then
      validLines+=("$textLine")
    fi
  done < "$textFile"
}

# A failure is reported on stderr by the function itself; validLines is then
# left empty.
collectValidLines "$file"
#######################################
# Split a text on the sentence-break characters '.', '!', '?' and newline.
# IFS is changed only inside this function (local), so the caller's IFS is
# never modified.
# NOTE: adjacent separators (e.g. "...") produce empty fields, which are
# counted as sentences as well.
# Globals:   sentences (written)
# Arguments: $1 - the text to split
#######################################
splitSentences() {
  local IFS=$'.!?\n'
  sentences=($1)
}

#######################################
# Count sentences, words and letters of a text file.
# Globals:   sentences, words, totalSentences, totalWords, totalLetters
#            (all written)
# Arguments: $1 - path of the text file
#######################################
measureText() {
  local textFile=$1
  local text letterCount

  text=$(<"$textFile")

  # The splits below rely on unquoted expansions; pathname expansion is turned
  # off meanwhile so that text such as "*" or "what?" is not replaced by the
  # names of matching files.
  set -f
  splitSentences "$text"
  # split using the current IFS (space, tab & newline unless changed elsewhere)
  words=($text)
  set +f

  # Count each character of class [:alpha:]: grep prints one match per line.
  # grep matches characters according to the locale, whereas "tr | wc -c"
  # works on bytes and miscounts multi-byte (e.g. accented) letters.
  # -a keeps grep from treating unusual input as a binary file.
  letterCount=$(grep -a -o '[[:alpha:]]' < "$textFile" | wc -l)
  # arithmetic expansion drops the padding some wc implementations add
  totalLetters=$((letterCount))
  totalSentences=${#sentences[@]}
  totalWords=${#words[@]}
}

# output of "optimized" doc: one valid line per row
printf "%s\n" "${validLines[@]}" > "$file".output
# read the new "optimized" file and compute the totals
measureText "$file".output
# debug
#for (( i=0; i<${totalSentences}; i++ ));
#do
# echo ${sentences[$i]}
#done
#for (( i=0; i<${totalWords}; i++ ));
#do
# echo ${words[$i]}
#done
#######################################
# Compute 89 + (300 * sentences - 10 * letters) / words with integer
# arithmetic (the quotient is truncated).
# Arguments: $1 - number of sentences
#            $2 - number of letters
#            $3 - number of words
# Outputs:   the index on stdout
# Returns:   1, with a message on stderr, when the number of words is zero
#            (the division would otherwise fail)
#######################################
computeGulpeaseIndex() {
  local sentenceCount=${1:-0}
  local letterCount=${2:-0}
  local wordCount=${3:-0}

  if (( wordCount == 0 )); then
    echo "Cannot compute the Gulpease Index: the text contains no words" >&2
    return 1
  fi

  echo $(( 89 + (300 * sentenceCount - 10 * letterCount) / wordCount ))
}

echo "$totalSentences"" sentences"
echo "$totalWords"" total words"
echo "$totalLetters"" total letters"
# When the index cannot be computed nothing is printed on stdout and, this
# being the last command, the failure becomes the exit status.
gulpeaseIndex=$(computeGulpeaseIndex "$totalSentences" "$totalLetters" "$totalWords") &&
  echo "Gulpease Index: ""$gulpeaseIndex"