-
Notifications
You must be signed in to change notification settings - Fork 0
/
AIPInPremis.rb
224 lines (189 loc) · 8.41 KB
/
AIPInPremis.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
module Daitss
  # Serialized form of an "XML Resolution" eventType element. processEvents
  # compares it against the string form of each event's premis:eventType node.
  XMLRES = '<eventType>XML Resolution</eventType>'.freeze

  # Reads an AIP descriptor in PREMIS-in-METS format and builds the int entity,
  # datafile, bitstream, agent, event and relationship records found in it.
  # Call process (or processAIPFile) first, then toDB to save the records.
  class AIPInPremis
    # Start with empty lookup tables; each one is keyed by the identifier of
    # the record it holds (relationships are kept as a plain list).
    def initialize
      @datafiles = {}
      @bitstreams = {}
      @formats = {}
      @anomalies = {}
      @inhibitors = {}
      @events = {}
      @agents = {}
      @relationships = []
    end

    # Parse an AIP descriptor file (PREMIS-in-METS) and process it.
    #
    # aip_file - path of the descriptor, handed to XML::Document.file.
    # package  - package record the int entity will be attached to. It is
    #            optional in the signature only so that existing one-argument
    #            calls keep failing with ArgumentError instead of changing shape.
    #
    # Raises ArgumentError when no package is given.
    def processAIPFile(aip_file, package = nil)
      raise ArgumentError, 'a package is required to process an AIP descriptor' if package.nil?

      # process needs both the package and the parsed document
      process(package, XML::Document.file(aip_file))
    end

    # Process an already parsed AIP descriptor (PREMIS-in-METS).
    #
    # package - package record; its intentity is set by processIntEntity.
    # aipxml  - parsed XML document of the descriptor.
    def process(package, aipxml)
      @package = package
      @doc = aipxml

      # create a new int entity, replacing any existing one with the same id
      processIntEntity
      # process all premis file objects
      processDatafiles
      # extract all premis representations
      processRepresentations
      # process all premis bitstreams
      processBitstreams
      # process all premis agents
      processAgents
      # process all premis events
      processEvents

      # process derived relationships associated with the files; this runs last
      # because processRelationship looks events and bitstreams up by id
      @doc.find("//premis:object[@xsi:type='file']", NAMESPACES).each do |obj|
        dfid = obj.find_first("premis:objectIdentifier/premis:objectIdentifierValue", NAMESPACES).content
        processRelationship(dfid, obj)
      end
    end

    # Build the int entity described by the document and attach it to the package.
    # If an int entity with the same id already exists in the database, its
    # children and the record itself are removed first.
    def processIntEntity
      @int_entity = Intentity.new
      @int_entity.fromAIP @doc

      entity = Intentity.first(:id => @int_entity.id)
      if entity
        entity.deleteChildren
        # TODO: needs testing. Tables that are not associated directly, such as
        # image/documents/texts/audios/object_formats, are expected to be cascade
        # deleted when the datafile is deleted.
        entity.destroy
      end

      @package.intentity = @int_entity
    end

    # Extract representation information from the premis document and record,
    # on every known datafile, the representations it belongs to.
    def processRepresentations
      @doc.find("//premis:object[@xsi:type='representation']", NAMESPACES).each do |obj|
        rep_id = obj.find_first("premis:objectIdentifier/premis:objectIdentifierValue", NAMESPACES).content

        obj.find("premis:relationship", NAMESPACES).each do |f|
          dfid = f.find_first("premis:relatedObjectIdentification/premis:relatedObjectIdentifierValue", NAMESPACES).content
          df = @datafiles[dfid]
          # files that were not loaded by processDatafiles are ignored
          df.setRepresentations(rep_id) unless df.nil?
        end
      end

      # set the origin of all datafiles once their representations are known
      @datafiles.each do |_dfid, df|
        df.setOrigin
      end
    end

    # Extract all file objects from the premis document. Files whose id matches
    # the OWNERID of a mets:file element without a mets:FLocat child are skipped.
    def processDatafiles
      sip_descriptor_node = @doc.find_first("//M:file[@USE='sip descriptor']", NS_PREFIX)
      sip_descriptor_ownerid = sip_descriptor_node['OWNERID'] if sip_descriptor_node

      fileObjects = @doc.find("//premis:object[@xsi:type='file']", NAMESPACES)
      obsolete_dfs = @doc.find("//mets:file[not(mets:FLocat)]", NAMESPACES).map { |n| n['OWNERID'] }.to_set

      fileObjects.each do |obj|
        df = Datafile.new
        df.fromPremis(obj, @formats, sip_descriptor_ownerid)

        unless obsolete_dfs.include? df.id
          @datafiles[df.id] = df
          @int_entity.datafiles << df
        end
      end
    end

    # Extract all bitstream objects from the premis document, keyed by their id.
    def processBitstreams
      @doc.find("//premis:object[@xsi:type='bitstream']", NAMESPACES).each do |obj|
        bs = Bitstream.new
        bs.fromPremis(obj, @formats)
        @bitstreams[bs.id] = bs
      end
    end

    # Extract all agents in the premis document, keyed by their id.
    def processAgents
      @doc.find("//premis:agent", NAMESPACES).each do |obj|
        agent = PremisAgent.new
        agent.fromPremis obj

        # use the existing agent record in the database if we have seen this agent before
        @agents[agent.id] = PremisAgent.first(:id => agent.id) || agent
      end
    end

    # Extract all events from the premis document. For each linked object only
    # the first XMLRES event is kept; every other event is kept as is. Events
    # that link to neither a known datafile nor the int entity are ignored.
    def processEvents
      object_id_path = "premis:linkingObjectIdentifier/premis:linkingObjectIdentifierValue"
      agent_id_path = "premis:linkingAgentIdentifier/premis:linkingAgentIdentifierValue"

      # Build a filtered copy rather than deleting from the array being iterated:
      # deleting in place makes the iteration skip the element after each removal.
      resolved = {}
      eventObjects = @doc.find("//premis:event", NAMESPACES).to_a.reject do |obj|
        next false unless obj.find_first("premis:eventType", NAMESPACES).to_s == XMLRES

        linked = obj.find_first(object_id_path, NAMESPACES).to_s
        duplicate = resolved.key?(linked)
        resolved[linked] = true
        duplicate
      end

      eventObjects.each do |obj|
        id = obj.find_first(object_id_path, NAMESPACES)
        df = @datafiles[id.content] unless id.nil?
        agent_id = obj.find_first(agent_id_path, NAMESPACES)
        agent = @agents[agent_id.content] unless agent_id.nil?

        if df # first check if this event is linked to a file object
          event = DatafileEvent.new
          event.fromPremis(obj, df, @anomalies)
        elsif id && @int_entity.match(id.content) # then check if it links to the int entity
          event = IntentityEvent.new
          event.fromPremis(obj)
        else
          next
        end

        event.setRelatedObject id.content
        # associate agent to the event
        agent.premis_events << event unless agent.nil?
        @events[event.id] = event
      end
    end

    # Extract and construct premis relationships for one file object.
    #
    # dfid     - identifier of the datafile; nothing happens if it is unknown.
    # file_obj - the premis file object node the relationships are read from.
    def processRelationship(dfid, file_obj)
      return if @datafiles[dfid].nil?

      d_relationships = file_obj.find("premis:relationship[premis:relationshipType = 'derivation' and premis:relationshipSubType = 'has source']", NAMESPACES)
      s_relationships = file_obj.find("premis:relationship[premis:relationshipType = 'structural' and premis:relationshipSubType = 'includes']", NAMESPACES)

      # derivations are only recorded when their related event was loaded
      d_relationships.each do |relationship|
        event_id = relationship.find_first("premis:relatedEventIdentification/premis:relatedEventIdentifierValue", NAMESPACES)
        event = @events[event_id.content] unless event_id.nil?
        next if event.nil?

        relationshipObj = Relationship.new
        relationshipObj.fromPremis(dfid, event.e_type, relationship)
        @relationships << relationshipObj
      end

      # structural "includes" relationships attach known bitstreams to the datafile
      s_relationships.each do |relationship|
        bsid = relationship.find_first("premis:relatedObjectIdentification/premis:relatedObjectIdentifierValue", NAMESPACES).content
        @datafiles[dfid].bitstreams << @bitstreams[bsid] if @bitstreams[bsid]
      end
    end

    # Save the extracted int entity, package, events and relationships to the
    # fast access database. Raises a RuntimeError as soon as any save fails.
    # TODO: may need to bypass save validation to speed up transactions
    def toDB
      unless @int_entity.save
        # presumably check_errors raises when it finds validation errors, so the
        # raise below is only reached when it found none -- confirm in Intentity
        @int_entity.check_errors
        raise "error in saving int entity, no validation error found"
      end

      unless @package.save
        raise "error in saving package #{@package}"
      end

      # explicitly saving the dependencies.
      @events.each { |_id, e| raise "error saving event records #{e.inspect}" unless e.save }
      @relationships.each { |rel| raise 'error saving relationship records' unless rel.save }
    end
  end
end