-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtraject_config.rb
137 lines (110 loc) · 5.92 KB
/
traject_config.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# A sample traject configuration, stripped down version of
# https://github.com/traject/traject/blob/master/test/test_support/demo_config.rb
# To run, `traject -c traject_config.rb marc_file.marc`
# To have access to various built-in logic
# for pulling things out of MARC21, like `marc_languages`
require 'traject/macros/marc21_semantics'
extend Traject::Macros::Marc21Semantics
# To have access to the traject marc format/carrier classifier
require 'traject/macros/marc_format_classifier'
extend Traject::Macros::MarcFormats
require 'traject/csv_writer'
require 'traject/delimited_writer'
# In this case, for simplicity, we provide all our settings in this one file.
# But you could choose to separate them into another config file. Divide things
# between files however you like; you can call traject with as many
# config files as you like: `traject -c one.rb -c two.rb -c etc.rb`
settings do
  # --- Input -------------------------------------------------------------
  # Source type. Alternatives, left here for reference:
  #   provide "marc_source.type", "xml"
  #   provide "marc_source.type", "json"
  provide "marc_source.type", "binary" # default

  # Character encoding of the incoming records.
  # Traject works with everything internally as UTF-8, but we need
  # to be able to consume MARC-8 content as well. Alternative:
  #   provide "marc_source.encoding", "MARC-8"
  provide "marc_source.encoding", "UTF-8" # *usually* the default

  # --- Output ------------------------------------------------------------
  # Which kind of writer to use. Other choices, left here for reference:
  #   provide "writer_class_name", "Traject::JsonWriter"
  #   provide "writer_class_name", "Traject::SolrJsonWriter" # default
  #   provide "writer_class_name", "Traject::YamlWriter"
  #   provide "writer_class_name", "Traject::DebugWriter"
  #   provide "writer_class_name", "Traject::DelimitedWriter" # default tsv
  #   provide "delimited_writer.delimiter", "\t"
  provide "writer_class_name", "Traject::CSVWriter"

  # If we were indexing into solr we would give its url:
  #   provide "solr.url", "http://solr.somewhere.edu:8983/solr/corename"

  # Comma-separated list of output fields for DelimitedWriter and CSVWriter.
  provide "delimited_writer.fields", "author_display,title_display"
end
# Take only the first 001 value, then let the block rewrite the accumulated
# values in place so each one carries a "bib_" prefix.
to_field "id", extract_marc("001", first: true) do |_record, accumulator, _context|
  accumulator.map! { |control_number| "bib_#{control_number}" }
end
# A constant: every record gets exactly this literal string.
to_field "source", literal("traject_test_last")

# Publication information.
to_field "publisher_t",       extract_marc("260abef:261abef:262ab:264ab")
to_field "published_display", extract_marc("260a", trim_punctuation: true)

# Standard numbers and identifiers.
to_field "isbn_t", extract_marc("020a:773z:776z:534z:556z")
to_field "issn",   extract_marc("022a:022l:022y:773x:774x:776x", separator: nil)
to_field "lccn",   extract_marc("010a")

# Physical description.
to_field "material_type_display", extract_marc("300a", separator: nil, trim_punctuation: true)

# Titles. Which "title" do we want? Alternatives kept for reference:
#   to_field "title1_t", extract_marc("245abk")
#   to_field "title2_t", extract_marc("245nps:130:240abcdefgklmnopqrs:210ab:222ab:242abcehnp:243abcdefgklmnopqrs:246abcdefgnp:247abcdefgnp")
#   to_field "title3_t", extract_marc("700gklmnoprst:710fgklmnopqrst:711fgklnpst:730abdefgklmnopqrst:740anp:505t:780abcrst:785abcrst:773abrst")
to_field "title_t",        extract_marc("245ak")
to_field "title_display",  extract_marc("245abk", trim_punctuation: true, first: true)
to_field "title_series_t", extract_marc("440a:490a:800abcdt:400abcd:810abcdt:410abcd:811acdeft:411acdef:830adfgklmnoprst:760ast:762ast")

# Authors and subjects.
to_field "author_display", extract_marc("100abcdq:110:111")
to_field "subject_t",      extract_marc("600:610:611:630:650:651avxyz:653aa:654abcvyz:655abcvxyz:690abcdxyz:691abxyz:692abxyz:693abxyz:656akvxyz:657avxyz:652axyz:658abcd")
# Fields populated by traject's built-in macros, for values that take more
# than a single field extraction. The macros come from the modules this
# file extends near its top; their sources live under
# https://github.com/traject/traject/tree/master/lib/traject/macros

# Facets
to_field "series_facet",      marc_series_facet
to_field "language_facet",    marc_languages
to_field "format",            marc_formats
to_field "subject_geo_facet", marc_geo_facet
to_field "subject_era_facet", marc_era_facet

# Dates, identifiers, and sort keys
to_field "pub_date",       marc_publication_date
to_field "oclcnum_t",      oclcnum
to_field "sortable_title", marc_sortable_title

# Music instrumentation
to_field "instrumentation_facet",       marc_instrumentation_humanized
to_field "instrumentation_code_unstem", marc_instrument_codes_normalized
# An example of more complex ruby logic 'in line' in the config file--
# too much more complicated than this, and you'd probably want to extract
# it to an external routine to keep things tidy.
#
# Use traject's LCC to broad category routine, but then supply a
# custom block to also use our local holdings 9xx info, and
# also classify sudoc-possessing records as 'Government Publication' discipline.
#
# The macro is called with :default => nil; presumably that keeps it from
# adding its own fallback value, so the "Unknown" fallback is decided once,
# at the end of this block -- confirm against the macro's documentation.
to_field "discipline_facet", marc_lcc_to_broad_category(:default => nil) do |record, accumulator|
  # Add in our local call numbers.
  Traject::MarcExtractor.cached("991:937").each_matching_line(record) do |field, _spec, _extractor|
    # We output call type 'processor' in subfield 'f' of our holdings
    # fields, that sort of maybe tells us if it's an LCC field.
    # When the data is right, which it often isn't.
    call_type   = field['f']
    call_number = field['a']

    if call_type == "sudoc"
      # We choose to call it:
      accumulator << "Government Publication"
    elsif call_type.nil? ||
          call_type == "lc" ||
          Traject::Macros::Marc21Semantics::LCC_REGEX.match(call_number)
      # Run the leading class letter through the map. Leading whitespace is
      # stripped first so a padded call number is still keyed by its letter.
      class_letter = call_number
      class_letter = class_letter.lstrip.slice(0, 1) if class_letter
      category = Traject::TranslationMap.new("lcc_top_level")[class_letter]

      # Only record a real category. Appending a nil lookup result would put
      # a nil value in the output and would also stop the "Unknown" fallback
      # below from firing, because the accumulator would no longer be empty.
      accumulator << category if category
    end
  end

  # If it's got an 086, we'll put it in "Government Publication", to be
  # consistent with when we do that from a local SuDoc call #.
  unless Traject::MarcExtractor.cached("086a").extract(record).empty?
    accumulator << "Government Publication"
  end

  # uniq it in case we added the same thing twice with GovPub
  accumulator.uniq!

  # Nothing matched at all: fall back to a fixed placeholder value.
  accumulator << "Unknown" if accumulator.empty?
end