-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWindowsExtractor.py
131 lines (107 loc) · 4.62 KB
/
WindowsExtractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pdfplumber
from pdfminer.layout import LTChar
from typing import List, Tuple
import os
import shutil
# Open the source benchmark PDF once; the extraction functions below read
# pages from this module-level handle.
pdf = pdfplumber.open('CIS-windows.pdf')

# Start every run from an empty output directory. Only remove the directory
# when it is actually there, so the very first run (no previous output) does
# not crash with FileNotFoundError.
if os.path.isdir("./MD-files-windows"):
    shutil.rmtree("./MD-files-windows")
os.mkdir("MD-files-windows")
def getSectionNumber(pgNumber):
    """Return the section number printed at the top of a page, if any.

    The page text is inspected starting at character index 2: when that
    character is a digit, everything from there up to (not including) the
    next space is returned as the section number string.

    Args:
        pgNumber: Zero-based index into ``pdf.pages``.

    Returns:
        The section number as a string (e.g. ``"1.1.2"``), or ``False`` when
        the page does not start with a section number.
    """
    text = pdf.pages[pgNumber].extract_text()
    # Guard against empty/missing page text and pages with fewer than three
    # characters, which previously raised on ``text[2]``.
    if not text or len(text) < 3 or not text[2].isdigit():
        return False
    # text[2] is a digit (so not a space); take everything up to the next
    # space. If the page has no further space, use the rest of the text
    # instead of running off the end of the string.
    end = text.find(" ", 2)
    if end == -1:
        return text[2:]
    return text[2:end]
def getRemediation(pgNumber):
    """Extract the remediation text for the section starting on a page.

    Starting at ``pgNumber``, pages are scanned forward until one containing
    the heading ``"Remediation:"`` is found. The text of that page is then
    rebuilt from its individual characters so that runs typeset in the code
    font can be wrapped in backticks, and the span between the
    ``"Remediation:"`` heading and the following ``"Default Value:"`` heading
    (or the end of the page) is returned.

    Args:
        pgNumber: Zero-based index into ``pdf.pages`` of the page on which
            the section starts.

    Returns:
        The remediation text as a string, or ``False`` when another section
        begins (or the document ends) before a remediation heading is found.
    """
    pageCount = len(pdf.pages)
    page = pdf.pages[pgNumber]
    text = page.extract_text() or ""

    # Find the page that carries the remediation heading for this section.
    while "Remediation:" not in text:
        pgNumber += 1
        if pgNumber >= pageCount:
            # Ran out of pages without finding a remediation.
            return False
        page = pdf.pages[pgNumber]
        text = page.extract_text() or ""
        # A digit at index 2 is the same marker getSectionNumber uses for
        # "a new section starts here": this section has no remediation.
        if len(text) > 2 and text[2].isdigit():
            return False

    # Per-character records of the page; each one is indexed by "fontname"
    # and "text" below.
    chars = page.chars
    # True where the character is set in CourierNewPSMT (code block font).
    monospaceFontIndices: List[bool] = [
        char["fontname"] == "ABCDEE+CourierNewPSMT" for char in chars
    ]
    # Re-generate the text from the chars so string indices line up with the
    # font flags computed above.
    text = "".join(char["text"] for char in chars)
    codeIndices = getTrueRanges(monospaceFontIndices)
    text = addFormat(text, codeIndices, "`")

    # Skip the 12-character heading plus the one character that follows it.
    remediationStart = text.find("Remediation:") + 13
    # Only look for the closing heading after the remediation starts, so an
    # earlier occurrence on the page cannot produce an empty result.
    remediationEnd = text.find("Default Value:", remediationStart)
    if remediationEnd != -1:
        return text[remediationStart:remediationEnd]
    return text[remediationStart:]
def getTrueRanges(list: List[bool]) -> 'List[Tuple[int, int]]':
    """Return the ranges of consecutive truthy values in a boolean list.

    Args:
        list: Sequence of flags to scan. (The parameter name shadows the
            builtin; it is kept so existing keyword callers keep working.)

    Returns:
        A list of ``(start, end)`` index pairs, both inclusive, one per
        maximal run of truthy items, in ascending order. An empty input or
        an input with no truthy items yields ``[]``.
    """
    indiceRanges: 'List[Tuple[int, int]]' = []
    runStart = None  # index where the current run of truthy items began
    lastIndex = -1
    # Single pass: open a run on the first truthy item, close it on the
    # first falsy item after it. No slicing, so the scan is linear.
    for index, item in enumerate(list):
        lastIndex = index
        if item:
            if runStart is None:
                runStart = index
        elif runStart is not None:
            indiceRanges.append((runStart, index - 1))
            runStart = None
    # A run that reaches the end of the input is closed at the last index.
    if runStart is not None:
        indiceRanges.append((runStart, lastIndex))
    return indiceRanges
def addFormat(text: str, codeIndices: 'List[Tuple[int, int]]', formatChar: str) -> str:
    """Wrap each index range of ``text`` in ``formatChar``.

    Used to mark code spans (with `` ` ``) or bold spans (with ``*``).

    Args:
        text: The text to decorate.
        codeIndices: ``(start, end)`` index pairs into ``text``, both
            inclusive. They are expected in ascending order and must not
            overlap.
        formatChar: Marker inserted immediately before ``start`` and
            immediately after ``end`` of every range.

    Returns:
        A new string with the markers inserted; ``text`` unchanged when
        ``codeIndices`` is empty.
    """
    # Build the result from slices in one pass rather than inserting into a
    # character list, which shifts the whole tail on every insert.
    pieces: List[str] = []
    cursor = 0  # first index of text not yet copied to the output
    for start, end in codeIndices:
        pieces.append(text[cursor:start])
        pieces.append(formatChar)
        pieces.append(text[start:end + 1])
        pieces.append(formatChar)
        cursor = end + 1
    pieces.append(text[cursor:])
    return "".join(pieces)
# Two-level section numbers (e.g. "1.1") whose heading and include directive
# have already been appended to a top-level index file; writeToFile consults
# and extends this list so each one is written only once per run.
sectionFilesWritten: List[str] = []
def writeToFile(sectionNum: str, remediation: str):
    """Append one section's remediation to the Markdown output tree.

    For a section number such as ``"1.2.3"`` the remediation is appended to
    ``./MD-files-windows/1/1.2.md`` under a ``###`` heading. The first time
    a given two-level section (``"1.2"``) is seen, a ``##`` heading and an
    ``{include}`` directive pointing at that file are also appended to the
    top-level index ``./MD-files-windows/1.md``.

    Args:
        sectionNum: Dotted section number string, e.g. ``"1.2.3"``.
        remediation: Remediation text for the section. The bullet glyphs
            U+25CF and U+F0B7 are stripped before writing.
    """
    # Folder name: everything before the first period.
    folderSectionNumber = sectionNum.partition(".")[0]
    # File name: everything before the second period.
    fileSectionNumber = ".".join(sectionNum.split(".")[:2])

    folderPath = "./MD-files-windows/" + folderSectionNumber
    # Do not rely on the folder having been pre-created for this section.
    os.makedirs(folderPath, exist_ok=True)

    # Explicit UTF-8 so non-ASCII text from the PDF cannot fail under a
    # narrower platform default; ``with`` closes the file even on error.
    with open(folderPath + "/" + fileSectionNumber + ".md", "a", encoding="utf-8") as file:
        file.write("\n" + "### " + sectionNum + " ")
        file.write("\n" + remediation.replace("\u25cf", "").replace("\uf0b7", ""))

    with open("./MD-files-windows/" + folderSectionNumber + ".md", "a+", encoding="utf-8") as file2:
        if fileSectionNumber not in sectionFilesWritten:
            file2.write("\n" + "## " + fileSectionNumber + " ")
            file2.write("\n ```{include} ./" + folderSectionNumber + "/" + fileSectionNumber + ".md" + "\n ``` \n")
            sectionFilesWritten.append(fileSectionNumber)
# Make the per-section folders that the Markdown files are placed in.
for i in range(1, 20):
    os.mkdir("./MD-files-windows/" + str(i))

# Walk the benchmark pages (from index 55 up to, but not including, 1196) and
# write out every section that has a remediation. The upper bound is clamped
# to the document length so a shorter PDF does not raise IndexError.
for pageNumber in range(55, min(1196, len(pdf.pages))):
    # Extract each value once per page; every call re-parses PDF text.
    sectionNumber = getSectionNumber(pageNumber)
    if sectionNumber is False:
        continue  # this page does not start a section
    remediation = getRemediation(pageNumber)
    if remediation is False:
        continue  # this section has no remediation
    writeToFile(sectionNumber, remediation)