-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmetadata-to-tables.py
65 lines (47 loc) · 3.11 KB
/
metadata-to-tables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Script to extract metadata from some standardized .txt metadata files.
# This script accepts FGDC metadata in .txt format and extracts every enumerated domain to a .csv file.
# Change the infile and outfolder variables to appropriate values for your project.
# Python 3.6.7
# Caroline Rose | [email protected] | Nov 9, 2018
import re # Import the regex module.
import csv #csv module
import os #operating system module
# Path of the metadata .txt file that findDomains() reads.
infile = "P:\\Map_Data_Model\\MAPS\\Forest_Co_Pleisto\\Forest_InitialData\\fr_pleis_metadata_mod.txt"
# Folder that receives the extracted "<layer>_<attribute>_domain.csv" files.
outfolder = "P:\\Map_Data_Model\\MAPS\\Forest_Co_Pleisto\\extracted domains\\"
def findDomains(infile):
    """Split a metadata text file into sections and export each one's domain.

    The file is split on the phrase "Detailed Description:". Every resulting
    section that contains "Entity Type:" is treated as the description of a
    layer: its layer name and attribute name are read from the
    "Entity Type Label:" and "Attribute Label:" lines, and the section text is
    handed to extractDomain().

    Args:
        infile: Path of the metadata .txt file to read.
    """
    with open(infile, 'rt') as in_file:  # open file for reading text.
        contents = in_file.read()

    for section in contents.split('Detailed Description:'):
        if "Entity Type:" not in section:
            continue  # not a layer description (e.g. the text before the first delimiter)

        # "." never matches a newline, so "(.*)" captures the rest of the line
        # and also works when the label sits on a final line with no newline.
        layer_match = re.search('Entity Type Label:(.*)', section)
        attribute_match = re.search('Attribute Label:(.*)', section)
        if layer_match is None or attribute_match is None:
            # Without both labels there is no file name to write to; skip the
            # section instead of failing on a None match.
            print("Skipping section without an Entity Type Label or Attribute Label")
            continue

        # NOTE(review): only the first "Attribute Label:" in the section is
        # used, while the whole section is scanned for enumerated values --
        # confirm this is intended for layers with several attributes.
        extractDomain(layer_match.group(1).strip(), attribute_match.group(1).strip(), section)
def extractDomain(layerName, attributeName, text, outdir=None):
    """Write the enumerated domain values found in *text* to a .csv file.

    Scans *text* line by line for "Enumerated Domain Value: " and
    "Enumerated Domain Value Definition: " entries (matched
    case-insensitively) and writes one ``value,definition`` row per entry to
    ``<layerName>_<attributeName>_domain.csv``. The list of values is also
    printed.

    Args:
        layerName: Layer name, used as the first part of the output file name.
        attributeName: Attribute name, used as the second part of the file name.
        text: Metadata text to scan.
        outdir: Folder to write the .csv file into. Defaults to the
            module-level ``outfolder`` setting.
    """
    if outdir is None:
        outdir = outfolder

    # Capture the rest of the line with the same case-insensitive pattern that
    # detects it, so differently-cased labels and values that themselves
    # contain "Value:" / "Definition:" are extracted whole.
    value_pattern = re.compile(r"Enumerated Domain Value: (.*)", re.IGNORECASE)
    definition_pattern = re.compile(r"Enumerated Domain Value Definition: (.*)", re.IGNORECASE)

    rows = []  # [value, definition] pairs in file order
    awaiting_definition = False  # True while the last value has no definition yet
    for line in text.split('\n'):
        value_match = value_pattern.search(line)
        if value_match is not None:
            rows.append([value_match.group(1).lstrip(), ""])
            awaiting_definition = True

        definition_match = definition_pattern.search(line)
        if definition_match is not None:
            definition = definition_match.group(1).lstrip(' ')  # strip spaces from the left side
            if awaiting_definition:
                # Pair the definition with the value it follows. Looking the
                # value up by content would give repeated values the first
                # occurrence's definition.
                rows[-1][1] = definition
                awaiting_definition = False
            else:
                # A definition with no preceding value is kept, not dropped.
                rows.append(["", definition])

    print("Domain values:", [value for value, _ in rows])

    # output to csv
    out_path = os.path.join(outdir, layerName + "_" + attributeName + '_domain.csv')
    with open(out_path, 'w') as outfile:
        writer = csv.writer(outfile, lineterminator='\n')
        writer.writerows(rows)
# Run the extraction on the configured input file.
findDomains(infile)