From 4e50236f7ed09b5651b90f1411916ffa54f324f2 Mon Sep 17 00:00:00 2001
From: Edgars Voroboks <edgars.voroboks@gmail.com>
Date: Wed, 2 Dec 2020 22:43:34 +0200
Subject: [PATCH] Initial commit of version 1.1

---
 README.md                           |   0
 xmlutils/bin/xmlkvrecursive.py      | 117 ++++++++++++++++++++++++++++
 xmlutils/bin/xmlprettyprint.py      |  92 ++++++++++++++++++++++
 xmlutils/bin/xmlsplit.py            |  89 +++++++++++++++++++++
 xmlutils/bin/xmlstripdeclaration.py |  34 ++++++++
 xmlutils/default/app.conf           |  18 +++++
 xmlutils/default/commands.conf      |  27 +++++++
 xmlutils/default/searchbnf.conf     |  66 ++++++++++++++++
 xmlutils/metadata/default.meta      |  19 +++++
 xmlutils/test/entity.csv            |  10 +++
 10 files changed, 472 insertions(+)
 create mode 100644 README.md
 create mode 100755 xmlutils/bin/xmlkvrecursive.py
 create mode 100755 xmlutils/bin/xmlprettyprint.py
 create mode 100755 xmlutils/bin/xmlsplit.py
 create mode 100755 xmlutils/bin/xmlstripdeclaration.py
 create mode 100644 xmlutils/default/app.conf
 create mode 100644 xmlutils/default/commands.conf
 create mode 100644 xmlutils/default/searchbnf.conf
 create mode 100644 xmlutils/metadata/default.meta
 create mode 100644 xmlutils/test/entity.csv

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/xmlutils/bin/xmlkvrecursive.py b/xmlutils/bin/xmlkvrecursive.py
new file mode 100755
index 0000000..bdff1bf
--- /dev/null
+++ b/xmlutils/bin/xmlkvrecursive.py
@@ -0,0 +1,172 @@
+# Copyright (C) 2010 Splunk Inc.  All Rights Reserved.  Version 4.0
+"""Splunk search command: recursively extract fields from an XML document.
+
+The XML is taken from the ``xml`` field when present, otherwise from the
+span of ``_raw`` between its first ``<`` and last ``>``.  Element text is
+stored under the underscore-joined element path and attribute values under
+``<path>-<attribute>``.
+
+NOTE: this script targets Python 2 (``StringIO`` module, ``iteritems``).
+"""
+import sys
+import splunk.Intersplunk
+import re
+import urllib
+import xml.sax
+import xml.sax.saxutils as saxutils
+from xml.sax.handler import ContentHandler
+from xml.sax.handler import EntityResolver
+from xml.sax.xmlreader import InputSource
+import StringIO
+import traceback
+import types
+
+
+def _to_str(value):
+    """Return value as a byte string, encoding unicode text as UTF-8.
+
+    A plain str() call raises UnicodeEncodeError for non-ASCII unicode text.
+    """
+    if isinstance(value, unicode):
+        return value.encode('utf-8')
+    return str(value)
+
+
+class NullInputSource(InputSource):
+    """Input source whose byte stream is a fixed placeholder message."""
+
+    def getByteStream(self):
+        return StringIO.StringIO("entity files not supported.")
+
+
+class NullEntityResolver(EntityResolver):
+    """Entity resolver that answers every request with NullInputSource."""
+
+    def resolveEntity(self, publicId, systemId):
+        # The requested public/system ids are deliberately ignored so that
+        # nothing named in a DOCTYPE is opened while parsing event data.
+        return NullInputSource()
+
+
+class XmlHandler(ContentHandler):
+    """SAX handler that collects element text and attributes as fields.
+
+    Call reset() before each parse and getNewFields() afterwards.
+    """
+
+    def __init__(self, flatten):
+        ContentHandler.__init__(self)
+        self.flatten = flatten
+        self.reset()
+
+    def reset(self):
+        """Discard all state left over from a previous document."""
+        self.key_prefix = []
+        self.keys_seen = set()
+        self.new_fields = {}
+        self.text_parts = []
+
+    def getNewFields(self):
+        """Return the mapping of field name to value (or list of values)."""
+        return self.new_fields
+
+    def setValue(self, value, suffix=''):
+        """Store value under the current element path plus suffix.
+
+        Whitespace-only values are ignored.  A repeated key turns the stored
+        value into a list holding every value seen for it.
+        """
+        text = _to_str(value)
+        if not text.strip():
+            return
+
+        dest_key = '_'.join(self.key_prefix) + suffix
+        if dest_key not in self.new_fields:
+            self.new_fields[dest_key] = text
+            return
+
+        # NOTE(review): this marker field is kept from the original code; it
+        # looks like leftover debugging output -- confirm before removing.
+        self.new_fields['multi values'] = 'yep'
+        existing = self.new_fields[dest_key]
+        if not isinstance(existing, list):
+            existing = [existing]
+            self.new_fields[dest_key] = existing
+        existing.append(text)
+
+    def _flushText(self):
+        """Emit the text collected since the last tag as a single value."""
+        text = ''.join(self.text_parts).strip()
+        self.text_parts = []
+        if text:
+            self.setValue(text)
+
+    def startElement(self, name, attrs):
+        # Text collected so far belongs to the enclosing element.
+        self._flushText()
+        self.key_prefix.append(name)
+
+        # With flatten set, a path that was already used gets a numbered
+        # name (name[2], name[3], ...) so that it yields separate fields.
+        if self.flatten and '_'.join(self.key_prefix) in self.keys_seen:
+            self.key_prefix.pop()
+            count = 2
+            newName = name + '[' + str(count) + ']'
+            while '_'.join(self.key_prefix) + '_' + newName in self.keys_seen:
+                count += 1
+                newName = name + '[' + str(count) + ']'
+            self.key_prefix.append(newName)
+
+        self.keys_seen.add('_'.join(self.key_prefix))
+
+        for k in attrs.getNames():
+            self.setValue(attrs.getValue(k), "-" + k)
+
+    def characters(self, content):
+        # The parser may deliver one text node in several chunks (for
+        # example around entity references); collect them so they become
+        # one value instead of several.
+        if content:
+            self.text_parts.append(content)
+
+    def endElement(self, name):
+        self._flushText()
+        self.key_prefix.pop()
+
+
+try:
+    results, dummyresults, settings = splunk.Intersplunk.getOrganizedResults()
+
+    keywords, argvals = splunk.Intersplunk.getKeywordsAndOptions()
+
+    flatten = argvals.get("flatten", "False").strip().lower() in ('true', '1', 'yes')
+
+    handler = XmlHandler(flatten)
+
+    for r in results:
+        try:
+            if 'xml' in r:
+                xml_text = r['xml']
+            else:
+                raw = r["_raw"]
+                xml_text = raw[raw.index('<'):raw.rindex('>') + 1]
+
+            handler.reset()
+
+            parser = xml.sax.make_parser()
+            parser.setContentHandler(handler)
+            parser.setEntityResolver(NullEntityResolver())
+            parser.parse(StringIO.StringIO(xml_text))
+
+            for k, v in handler.getNewFields().iteritems():
+                r[k] = v
+        except Exception:
+            # Best effort: keep the event and prepend the failure to _raw.
+            stack = traceback.format_exc()
+            r['_raw'] = "Failed to parse: " + str(stack) + "\n" + r.get('_raw', '')
+
+except Exception:
+    stack = traceback.format_exc()
+    results = splunk.Intersplunk.generateErrorResults("Error : Traceback: " + str(stack))
+
+splunk.Intersplunk.outputResults(results)
diff --git a/xmlutils/bin/xmlprettyprint.py b/xmlutils/bin/xmlprettyprint.py
new file mode 100755
index 0000000..e21792c
--- /dev/null
+++ b/xmlutils/bin/xmlprettyprint.py
@@ -0,0 +1,110 @@
+# Copyright (C) 2010 Splunk Inc.  All Rights Reserved.  Version 4.0
+"""Splunk search command: pretty print an XML document.
+
+The XML is read from the ``xml`` field when present, otherwise from the span
+of ``_raw`` between its first ``<`` and last ``>``.  The indented result is
+written back to the field the XML was read from.
+
+NOTE: this script targets Python 2 (``StringIO`` module).
+"""
+import sys
+import splunk.Intersplunk
+import xml.sax
+import xml.sax.saxutils as saxutils
+from xml.sax.handler import ContentHandler
+from xml.sax.handler import EntityResolver
+from xml.sax.xmlreader import InputSource
+import StringIO
+import traceback
+
+
+class NullInputSource(InputSource):
+    """Input source whose byte stream is a fixed placeholder message."""
+
+    def getByteStream(self):
+        return StringIO.StringIO("entity files not supported.")
+
+
+class NullEntityResolver(EntityResolver):
+    """Entity resolver that answers every request with NullInputSource."""
+
+    def resolveEntity(self, publicId, systemId):
+        # The requested public/system ids are deliberately ignored so that
+        # nothing named in a DOCTYPE is opened while parsing event data.
+        return NullInputSource()
+
+
+class XmlHandler(ContentHandler):
+    """SAX handler that re-serializes a document with two-space indentation."""
+
+    def __init__(self):
+        ContentHandler.__init__(self)
+        self.reset()
+
+    def reset(self, r=None):
+        """Clear all output state; r is accepted for compatibility and unused."""
+        self.parts = []
+        self.indent = 0
+        self.open_tag = None
+
+    def getOutput(self):
+        """Return the pretty-printed text produced so far."""
+        return ''.join(self.parts)
+
+    def startElement(self, name, attrs):
+        self.open_tag = name
+        self.parts.append('\n' + '  ' * self.indent + '<' + name)
+        self.indent += 1
+        for k in attrs.getNames():
+            self.parts.append(' ' + k + '=' + saxutils.quoteattr(attrs.getValue(k)))
+        self.parts.append('>')
+
+    def characters(self, content):
+        # Whitespace-only chunks are dropped; any other text is kept verbatim.
+        if content.strip():
+            self.parts.append(saxutils.escape(content))
+
+    def endElement(self, name):
+        self.indent -= 1
+        # The closing tag goes on its own line unless this element was the
+        # most recently opened one, i.e. it contained no child elements.
+        if self.open_tag != name:
+            self.parts.append('\n' + '  ' * self.indent)
+        self.parts.append('</' + name + '>')
+        # Forget the open tag so that an enclosing element with the same
+        # name is not mistaken for one without children.
+        self.open_tag = None
+
+
+try:
+    results, dummyresults, settings = splunk.Intersplunk.getOrganizedResults()
+
+    handler = XmlHandler()
+
+    for r in results:
+        try:
+            if 'xml' in r:
+                xml_text = r['xml']
+                dest_field = 'xml'
+            else:
+                raw = r["_raw"]
+                dest_field = '_raw'
+                xml_text = raw[raw.index('<'):raw.rindex('>') + 1]
+
+            handler.reset(xml_text)
+            parser = xml.sax.make_parser()
+            parser.setContentHandler(handler)
+            parser.setEntityResolver(NullEntityResolver())
+            parser.parse(StringIO.StringIO(xml_text))
+
+            r[dest_field] = handler.getOutput()
+        except Exception:
+            # Best effort: keep the event and prepend the failure to _raw.
+            stack = traceback.format_exc()
+            r['_raw'] = "Failed to parse: " + str(stack) + "\n" + r.get('_raw', '')
+
+except Exception:
+    stack = traceback.format_exc()
+    results = splunk.Intersplunk.generateErrorResults("Error : Traceback: " + str(stack))
+
+splunk.Intersplunk.outputResults(results)
diff --git a/xmlutils/bin/xmlsplit.py b/xmlutils/bin/xmlsplit.py
new file mode 100755
index 0000000..54d03de
--- /dev/null
+++ b/xmlutils/bin/xmlsplit.py
@@ -0,0 +1,126 @@
+# Copyright (C) 2010 Splunk Inc.  All Rights Reserved.  Version 4.0
+"""Splunk search command: split an XML document into one event per element.
+
+The XML is read from the ``xml`` field when present, otherwise from the span
+of ``_raw`` between its first ``<`` and last ``>``.  Every element whose name
+equals the ``field`` option becomes a copy of the source event with ``_raw``
+set to that element's markup.
+
+NOTE: this script targets Python 2 (``StringIO`` module).
+"""
+import sys
+import splunk.Intersplunk
+import re
+import xml.sax
+import xml.sax.saxutils as saxutils
+from xml.sax.handler import ContentHandler
+from xml.sax.handler import EntityResolver
+from xml.sax.xmlreader import InputSource
+import copy
+import StringIO
+import traceback
+
+
+class NullInputSource(InputSource):
+    """Input source whose byte stream is a fixed placeholder message."""
+
+    def getByteStream(self):
+        return StringIO.StringIO("entity files not supported.")
+
+
+class NullEntityResolver(EntityResolver):
+    """Entity resolver that answers every request with NullInputSource."""
+
+    def resolveEntity(self, publicId, systemId):
+        # The requested public/system ids are deliberately ignored so that
+        # nothing named in a DOCTYPE is opened while parsing event data.
+        return NullInputSource()
+
+
+class XmlHandler(ContentHandler):
+    """SAX handler that emits one result row per element named ``field``."""
+
+    def __init__(self, field):
+        ContentHandler.__init__(self)
+        self.field = field
+        self.reset([])
+
+    def reset(self, newResults, row=None):
+        """Prepare for a new document.
+
+        newResults -- list that receives one row per matching element.
+        row        -- event the document came from; every emitted row is a
+                      deep copy of it with _raw replaced.  Defaults to an
+                      empty row.
+        """
+        self.current_output = ''
+        self.newResults = newResults
+        self.row = {} if row is None else row
+
+    def startElement(self, name, attrs):
+        if name == self.field:
+            # A matching element starts: drop whatever was collected before.
+            self.current_output = ''
+        pieces = ['<' + name]
+        for k in attrs.getNames():
+            pieces.append(' ' + k + '=' + saxutils.quoteattr(attrs.getValue(k)))
+        pieces.append('>')
+        self.current_output += ''.join(pieces)
+
+    def characters(self, content):
+        self.current_output += saxutils.escape(content)
+
+    def endElement(self, name):
+        self.current_output += '</' + name + '>'
+        if name == self.field:
+            # Emit only when the buffer starts with this element's opening
+            # tag.  It does not when a same-named element nested inside has
+            # already been emitted and cleared the buffer.
+            if re.match('^<' + re.escape(self.field) + '[ >]', self.current_output):
+                newRow = copy.deepcopy(self.row)
+                newRow['_raw'] = self.current_output
+                self.newResults.append(newRow)
+            self.current_output = ''
+
+
+try:
+    results, dummyresults, settings = splunk.Intersplunk.getOrganizedResults()
+
+    keywords, argvals = splunk.Intersplunk.getKeywordsAndOptions()
+
+    field = argvals.get("field", None)
+    if field is None:
+        raise Exception("Must supply name of field in field=fieldName")
+
+    newResults = []
+
+    handler = XmlHandler(field)
+
+    for r in results:
+        # Rows added for this event start here, so a failure can drop them.
+        first_new = len(newResults)
+        try:
+            if 'xml' in r:
+                xml_text = r['xml']
+            else:
+                raw = r["_raw"]
+                xml_text = raw[raw.index('<'):raw.rindex('>') + 1]
+
+            handler.reset(newResults, r)
+            parser = xml.sax.make_parser()
+            parser.setContentHandler(handler)
+            parser.setEntityResolver(NullEntityResolver())
+            parser.parse(StringIO.StringIO(xml_text))
+        except Exception:
+            stack = traceback.format_exc()
+            r['_raw'] = "Failed to parse: " + str(stack) + r.get('_raw', '')
+            # Replace this event's partial output with the annotated event
+            # and keep the rows already produced from earlier events.
+            del newResults[first_new:]
+            newResults.append(r)
+
+except Exception:
+    stack = traceback.format_exc()
+    newResults = splunk.Intersplunk.generateErrorResults("Error : Traceback: " + str(stack))
+
+splunk.Intersplunk.outputResults(newResults)
diff --git a/xmlutils/bin/xmlstripdeclaration.py b/xmlutils/bin/xmlstripdeclaration.py
new file mode 100755
index 0000000..a2623d2
--- /dev/null
+++ b/xmlutils/bin/xmlstripdeclaration.py
@@ -0,0 +1,43 @@
+# Copyright (C) 2010 Splunk Inc.  All Rights Reserved.  Version 4.0
+"""Splunk search command: remove the XML declaration from a document.
+
+The XML is read from the ``xml`` field when present, otherwise from the span
+of ``_raw`` between its first ``<`` and last ``>``, and the result is written
+back to the field it was read from.
+"""
+import splunk.Intersplunk
+import traceback
+
+
+try:
+    results, dummyresults, settings = splunk.Intersplunk.getOrganizedResults()
+
+    for r in results:
+        try:
+            if 'xml' in r:
+                xml_text = r['xml']
+                dest_field = 'xml'
+            else:
+                raw = r["_raw"]
+                dest_field = '_raw'
+                xml_text = raw[raw.index('<'):raw.rindex('>') + 1]
+
+            if xml_text.startswith('<?'):
+                # Remove the xml declaration: a wrong charset in it makes the
+                # parser fail.  Offsets must come from xml_text itself, since
+                # raw may have text before its first '<' and is not set at all
+                # when the xml field is used.
+                xml_text = xml_text[xml_text.index('<', 5):xml_text.rindex('>') + 1]
+
+            r[dest_field] = xml_text
+
+        except Exception:
+            # Best effort: keep the event and prepend the failure to _raw.
+            stack = traceback.format_exc()
+            r['_raw'] = "Failed to parse: " + str(stack) + r.get('_raw', '')
+
+except Exception:
+    stack = traceback.format_exc()
+    results = splunk.Intersplunk.generateErrorResults("Error : Traceback: " + str(stack))
+
+splunk.Intersplunk.outputResults(results)
diff --git a/xmlutils/default/app.conf b/xmlutils/default/app.conf
new file mode 100644
index 0000000..dbdb044
--- /dev/null
+++ b/xmlutils/default/app.conf
@@ -0,0 +1,18 @@
+[launcher]
+version = 1.1
+author = vbumgarner
+description = XML utilities
+
+[package]
+id = xmlutils
+
+[install]
+state = enabled
+build = 2
+
+[ui]
+is_visible = false
+is_manageable = false
+label = xmlutils
+
+
diff --git a/xmlutils/default/commands.conf b/xmlutils/default/commands.conf
new file mode 100644
index 0000000..568a04a
--- /dev/null
+++ b/xmlutils/default/commands.conf
@@ -0,0 +1,27 @@
+[xmlkvrecursive]
+filename = xmlkvrecursive.py
+retainsevents = true
+overrides_timeorder = false
+streaming = true
+
+[xmlsplit]
+filename = xmlsplit.py
+retainsevents = true
+overrides_timeorder = false
+run_in_preview = false
+streaming = true
+
+[xmlprettyprint]
+filename = xmlprettyprint.py
+retainsevents = true
+overrides_timeorder = false
+run_in_preview = false
+streaming = true
+
+[xmlstripdeclaration]
+filename = xmlstripdeclaration.py
+retainsevents = true
+overrides_timeorder = false
+run_in_preview = false
+streaming = true
+
diff --git a/xmlutils/default/searchbnf.conf b/xmlutils/default/searchbnf.conf
new file mode 100644
index 0000000..6b3eacd
--- /dev/null
+++ b/xmlutils/default/searchbnf.conf
@@ -0,0 +1,66 @@
+[xmlkvrecursive-command]
+syntax = xmlkvrecursive (flatten=<bool>)?
+shortdesc = Builds fields recursively from xml.
+description = Given an xml document in either _raw or a field called xml, all element text and attribute values are extracted into named fields. When flatten is true, a repeated element path creates new numbered fields (name[2], name[3], ...); otherwise the repeated values are collected into a multivalue field.
+default = xmlkvrecursive
+example1 = ... | xmlkvrecursive
+example2 = ... | xmlstripdeclaration | xmlkvrecursive flatten=true
+commentcheat = Builds fields recursively from xml.
+examplecheat = xmlkvrecursive (flatten=true)
+category = formatting
+maintainer = vbumgarner
+usage = public
+appears-in=4.1
+tags = xml kv
+related = xpath xmlprettyprint xmlsplit xmlstripdeclaration
+
+
+[xmlsplit-command]
+syntax = xmlsplit field=<string>
+shortdesc = Splits an xml into separate events by node.
+description = Given an xml document in either _raw or a field called xml, create an event for each node specified in field.
+default = xmlsplit field="field1"
+example1 = ... | xmlsplit field="event"
+example2 = ... | xmlstripdeclaration | xmlsplit field="event"
+commentcheat = Splits an xml into separate events by node.
+examplecheat = xmlsplit field="event"
+category = formatting
+maintainer = vbumgarner
+usage = public
+appears-in=4.1
+tags = xml split
+related = xpath xmlkvrecursive xmlprettyprint xmlstripdeclaration
+
+
+[xmlprettyprint-command]
+syntax = xmlprettyprint
+shortdesc = Pretty prints xml.
+description = Given an xml document in either _raw or a field called xml, pretty print the xml and write it back to the field it was read from.
+default = xmlprettyprint
+example1 = ... | xmlprettyprint
+example2 = ... | xmlstripdeclaration | xmlprettyprint
+commentcheat = Pretty prints xml.
+examplecheat = xmlprettyprint
+category = formatting
+maintainer = vbumgarner
+usage = public
+appears-in=4.1
+tags = xml pretty
+related = xpath xmlkvrecursive xmlsplit xmlstripdeclaration
+
+[xmlstripdeclaration-command]
+syntax = xmlstripdeclaration
+shortdesc = Removes the xml declaration from the beginning of an xml document.
+description = Given an xml document in either _raw or a field called xml, remove the xml declaration, as it may cause a parsing error.
+default = xmlstripdeclaration
+example1 = ... | xmlstripdeclaration
+example2 = ... | xmlstripdeclaration | xmlprettyprint
+commentcheat = Removes the xml declaration from the beginning of an xml document.
+examplecheat = xmlstripdeclaration
+category = formatting
+maintainer = vbumgarner
+usage = public
+appears-in=4.1
+tags = xml
+related = xpath xmlkvrecursive xmlprettyprint xmlsplit
+
diff --git a/xmlutils/metadata/default.meta b/xmlutils/metadata/default.meta
new file mode 100644
index 0000000..f9ecc07
--- /dev/null
+++ b/xmlutils/metadata/default.meta
@@ -0,0 +1,19 @@
+
+# Application-level permissions
+
+[]
+access = read : [ * ], write : [ admin, power ]
+
+[viewstates]
+access = read : [ * ], write : [ * ]
+
+[lookups]
+export = system
+
+[commands]
+export = system
+access = read : [ * ], write : [ admin ]
+
+[searchbnf]
+export = system
+
diff --git a/xmlutils/test/entity.csv b/xmlutils/test/entity.csv
new file mode 100644
index 0000000..cb8d569
--- /dev/null
+++ b/xmlutils/test/entity.csv
@@ -0,0 +1,10 @@
+junk header
+
+_raw,_time,yep
+"<?xml version=""1.0""?><!DOCTYPE x [<!ENTITY x SYSTEM ""file:///etc/issue"">]><x><b>foo &x;</b><b>bar</b><b>la</b></x>",12345,nope
+"<?xml version=""1.0""?><!DOCTYPE x [<!ENTITY x SYSTEM ""file:///etc/issue"">]><x><b>foo &x;</b><b>bar</b></x>",12345,not
+"<?xml version=""1.0""?><!DOCTYPE x [<!ENTITY x SYSTEM ""file:///etc/issue"">]><x><b>foo &x;</b><b>bar</b></x>",12345,today
+"<?xml version=""1.0""?><!DOCTYPE x [<!ENTITY x SYSTEM ""file:///etc/issue"">]><x><b>foo &x;</b><b>bar</b></x>",12345
+"<?xml version=""1.0""?><!DOCTYPE x [<!ENTITY x SYSTEM ""file:///etc/issue"">]><x><b>foo &x;</b><b>bar</b></x>",12345
+"<?xml version=""1.0""?><!DOCTYPE x [<!ENTITY x SYSTEM ""file:///etc/issue"">]><x><b>foo &x;</b><b>bar</b></x>",12345
+"<x><b>foo</b><b>bar<c f=""sdf"">ha</c></b></x>",12345