diff --git a/a.html b/a.html
new file mode 100644
index 0000000..240cdc3
--- /dev/null
+++ b/a.html
@@ -0,0 +1,1750 @@
+ return math.sqrt(sum/float(num-1))
+ return None
def average(inList):
    """Return the arithmetic mean of ``inList`` as a float.

    Any iterable of numbers is accepted (list, tuple, generator, ...).
    Returns None when the input is empty.
    """
    # Materialise once so one-shot iterators can be both summed and counted.
    values = list(inList)
    if not values:
        return None
    return sum(values) / float(len(values))
diff --git a/pageparser.py b/pageparser.py
new file mode 100644
index 0000000..73e0660
--- /dev/null
+++ b/pageparser.py
@@ -0,0 +1,228 @@
+# -- coding: utf-8
+from sgmllib import SGMLParser
+import fann,grubbs
class ParseTag(object):
    """A tag which is parsed by the HTML parser(s).

    Instances compare equal to a tag name (or to another ParseTag) when the
    names match case-insensitively, which is what lets callers write
    ``'a' in features`` and ``features.index('a')``.

    Attributes:
        tag: the tag name, e.g. ``'a'``.
        elmlist: list of attribute names associated with the tag.
        enabled: whether parsing of this tag is switched on.
        init: state flag; SimpleParser sets it to True on the start tag and
            back to False on the end tag.
    """

    def __init__(self, tag, elmlist, enabled=True ,init=False):
        self.tag = tag
        self.elmlist = elmlist
        self.enabled = enabled
        self.init = init

    def disable(self):
        """ Disable parsing of this tag """
        self.enabled = False

    def enable(self):
        """ Enable parsing of this tag """
        self.enabled = True

    def isEnabled(self):
        """ Is this tag enabled ? """
        return self.enabled

    def __eq__(self, tag):
        """Case-insensitive comparison against a tag name or another ParseTag."""
        other = tag.tag if isinstance(tag, ParseTag) else tag
        if not hasattr(other, 'lower'):
            # Not a string-like name: let Python fall back to its default.
            return NotImplemented
        return self.tag.lower() == other.lower()

    def __ne__(self, tag):
        """Inverse of __eq__ (Python 2 does not derive ``!=`` from ``==``)."""
        result = self.__eq__(tag)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Keep hashing consistent with the case-insensitive equality above.
        return hash(self.tag.lower())
class RecordTag(object):
    """One element recorded by DOMTree: name, attributes, text and tree links."""

    def __init__(self, tag, attrs, inme=True):
        self.tag = tag
        # Sequence of (name, value) attribute pairs.
        self.attrs = attrs
        # True while the tag is still open.
        self.inMe = inme
        # Text collected for this tag via add_data().
        self.data = ''
        self.parent = None
        self.preSibling = None
        self.nextSibling = None
        # Text-to-markup ratio, filled in by calculate_density().
        self.density = 0
        self.children = []

    def calculate_density(self):
        """Set ``self.density`` to len(text) / (len(text) + estimated markup length).

        The markup estimate counts the opening and closing tag plus
        ``len(name) + len(value) + 1`` per attribute.  Best effort: on any
        error the exception is printed and ``density`` keeps its old value.
        """
        try:
            # '<tag>' + '</tag>' -> 2 * len(tag) + len('<>') + len('</>') = 2 * len(tag) + 5
            markup = len(self.tag) * 2 + 5
            for key, value in self.attrs:
                # A valueless attribute may carry None; count it as empty
                # instead of aborting the whole calculation.
                markup += len(key) + len(value or '') + 1
            dataLen = len(self.data)
            self.density = dataLen / float(dataLen + markup)
        except Exception as e:
            print(e)

    def set_in_me(self, inme):
        """Mark the tag as still open (True) or closed (False)."""
        self.inMe = inme

    def still_in_me(self):
        """Return True while the tag is still open."""
        return self.inMe

    def add_data(self, data):
        """Append ``data`` to the text collected for this tag."""
        self.data += data

    def __str__(self):
        return self.tag
class DOMTree(list):
    """Document-ordered list of RecordTag objects with parent/sibling links.

    A RecordTag is appended for every start tag; an end tag closes the
    innermost tag that is still open, whatever its name.  Tags listed in
    ``omitTags`` are not recorded at all.
    """

    def __init__(self):
        super(DOMTree, self).__init__()
        # Tag selected by the most recent get_last_closed_tag() lookup.
        self.lastClosedRecTag = None
        # Innermost open tag found by the most recent successful lookup.
        self.lastOpenRecTag = None
        # Tag created by the most recent start_tag().
        self.curTag = None
        # Tag names that are ignored by start_tag() and end_tag().
        self.omitTags = ['font', 'br', 'strong', 'b']

    def _innermost_open(self):
        """Return the last tag in the list that is still open, or None."""
        for recTag in reversed(self):
            if recTag.still_in_me():
                return recTag
        return None

    def get_siblings(self, recTag):
        """Return the other children of ``recTag``'s parent ([] if none)."""
        if not recTag:
            return []
        return [tag for tag in self.get_children(recTag.parent) if tag != recTag]

    def get_children(self, recTag):
        """Return ``recTag.children``, or [] when ``recTag`` is None."""
        if not recTag:
            return []
        return recTag.children

    def get_last_open_tag(self):
        """Point ``lastOpenRecTag`` at the innermost open tag.

        NOTE(review): when no tag is open the previous value is kept, so a
        tag started after everything has been closed is attached to an
        already closed parent.  Left unchanged because the densities derived
        from these links feed the trained network.
        """
        found = self._innermost_open()
        if found is not None:
            self.lastOpenRecTag = found

    def start_tag(self, tag, attrs):
        """Record a new open tag and link it to its parent and previous sibling."""
        if tag in self.omitTags:
            return
        self.get_last_open_tag()
        newTag = RecordTag(tag, attrs)
        self.curTag = newTag
        if self:
            preTag = self[-1]
            parent = self.lastOpenRecTag
            newTag.parent = parent
            if parent is not None:
                parent.children.append(newTag)
                # The tag just before this one is closed, so the most
                # recently closed tag becomes the previous sibling.
                if not preTag.still_in_me():
                    closed = self.lastClosedRecTag
                    newTag.preSibling = closed
                    if closed is not None:
                        closed.nextSibling = newTag
        self.append(newTag)

    def get_last_closed_tag(self):
        """Point ``lastClosedRecTag`` at the tag an end tag should close.

        Despite the name this selects the innermost tag that is still
        *open*; end_tag() closes it right afterwards.  When nothing is open
        the previous value is kept.
        """
        found = self._innermost_open()
        if found is not None:
            self.lastClosedRecTag = found

    def end_tag(self, tag):
        """Close the innermost open tag and compute its density."""
        if tag in self.omitTags:
            return
        self.get_last_closed_tag()
        closing = self.lastClosedRecTag
        if closing is None:
            # Stray end tag before any start tag was recorded: nothing to close.
            return
        closing.set_in_me(False)
        closing.calculate_density()

    def handle_data(self, data):
        """Add stripped text to the current tag, or to the last open tag if it is closed."""
        self.get_last_open_tag()
        data = data.strip()
        current = self.curTag
        if current is None:
            # Text before the first recorded tag is dropped.
            return
        if current.still_in_me():
            current.add_data(data)
        elif self.lastOpenRecTag is not None:
            self.lastOpenRecTag.add_data(data)
+class SimpleParser(SGMLParser):
+ features = [ ParseTag('a', ['href']),
+ ParseTag('link', ['href']),
+ ParseTag('body', []),
+ ParseTag('title',[]),
+ ParseTag('script',[]),
+ ParseTag('style',[]),
+ ParseTag('meta', ['CONTENT', 'content',]),
+ ]
+ def __init__(self):
+ self.cream=''
+ self.domTree=DOMTree()
+ self.ann=fann.NeuNet()
+ self.ann.create_from_file("cream.net")
+ def unknown_starttag(self, tag, attrs):
+ if tag in self.features:
+ parsetag = self.features[self.features.index(tag)]
+ parsetag.init=True
+ self.domTree.start_tag(tag,attrs)
+ def unknown_endtag(self, tag):
+ if tag in self.features:
+ parsetag=self.features[self.features.index(tag)]
+ parsetag.init=False
+ self.domTree.end_tag(tag)
+ def handle_data(self, data):
+ if not self.features[self.features.index('style')].init \
+ and not self.features[self.features.index('script')].init:
+ self.domTree.handle_data(data)
+ def get_cream(self):
+ idx=0
+ bodyIdx=0
+ for rtag in self.domTree:
+ if rtag.tag=='body':
+ bodyIdx=idx
+ break
+ idx+=1
+ candidates={}
+ pos=0
+ for rtag in self.domTree[bodyIdx+1:]:
+ pos+=1
+ if rtag.tag in ['textarea']:
+ continue
+ ownDensity=rtag.density
+ if rtag.preSibling:
+ preDensity=rtag.preSibling.density
+ if preDensity==0.0 and rtag.preSibling.preSibling:
+ preDensity=rtag.preSibling.preSibling.density
+ else:
+ preDensity=0.0
+ if rtag.nextSibling:
+ nextDensity=rtag.nextSibling.density
+ if nextDensity==0.0 and rtag.nextSibling.nextSibling:
+ nextDensity=rtag.nextSibling.nextSibling.density
+ else:
+ nextDensity=0.0
+ # Load the data we described above.
+ calc_out=self.ann.run([len(rtag.data),ownDensity,preDensity,nextDensity])
+ if calc_out[0]>-0.7:
+ candidates[pos]=rtag
+ #~ print rtag.tag,' ',calc_out[0],' ',pos,' len:',len(rtag.data),' ',ownDensity,' ',preDensity,' ',nextDensity
+ #~ print "==============================="
+ #eleminate the tag that is far away from most of the tags
+ validTagKeys=grubbs.grubb_eleminate_outliers(candidates.keys())
+ validTagKeys.sort()
+ for key in validTagKeys:
+ print candidates[key].tag,' ',key,' ',candidates[key].data
+ def reset(self):
+ SGMLParser.reset(self)
class CreamParser(SimpleParser):
    """SimpleParser variant whose input is fed through effbot's sgmlop."""

    def __init__(self):
        # sgmlop has to be built/installed already; it is imported here
        # rather than at module level.
        import sgmlop
        engine = sgmlop.SGMLParser()
        # presumably sgmlop calls back finish_starttag / finish_endtag /
        # handle_data on the registered object -- confirm against sgmlop docs
        engine.register(self)
        self.parser = engine
        SimpleParser.__init__(self)

    def finish_starttag(self, tag, attrs):
        """Route a start tag to SimpleParser.unknown_starttag."""
        self.unknown_starttag(tag, attrs)

    def finish_endtag(self, tag):
        """Route an end tag to SimpleParser.unknown_endtag."""
        self.unknown_endtag(tag)

    def feed(self, data):
        """Pass ``data`` to the sgmlop parser instead of SGMLParser.feed."""
        self.parser.feed(data)
diff --git a/pytidy/__init__.py b/pytidy/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pytidy/pytidy.py b/pytidy/pytidy.py
new file mode 100644
index 0000000..3fa417b
--- /dev/null
+++ b/pytidy/pytidy.py
@@ -0,0 +1,53 @@
+# This file was automatically generated by SWIG (http://www.swig.org).
+# Version 1.3.36
+# Don't modify this file, modify the SWIG interface instead.
+# This file is compatible with both classic and new-style classes.
+import _pytidy
+import new
+new_instancemethod = new.instancemethod
# Alias the builtin ``property`` when the interpreter provides it.
try:
    _swig_property = property
except NameError:
    pass # Python < 2.2 doesn't have 'property'.
def _swig_setattr_nondynamic(self, class_type, name, value, static=1):
    """Set attribute ``name`` on a SWIG proxy object.

    ``thisown`` is forwarded to ``self.this.own``; a ``this`` value of type
    ``PySwigObject`` is stored directly; names found in
    ``class_type.__swig_setmethods__`` go through their setter.  Any other
    name is stored in the instance dict, except that with ``static`` true a
    name the object does not already have raises AttributeError.
    """
    if name == "thisown":
        return self.this.own(value)
    if name == "this" and type(value).__name__ == 'PySwigObject':
        self.__dict__[name] = value
        return
    setter = class_type.__swig_setmethods__.get(name, None)
    if setter:
        return setter(self, value)
    if static and not hasattr(self, name):
        raise AttributeError("You cannot add attributes to %s" % self)
    self.__dict__[name] = value
def _swig_setattr(self, class_type, name, value):
    """Set an attribute on a SWIG proxy with ``static`` switched off."""
    result = _swig_setattr_nondynamic(self, class_type, name, value, 0)
    return result
def _swig_getattr(self, class_type, name):
    """Read attribute ``name`` from a SWIG proxy object.

    ``thisown`` is answered by ``self.this.own()``; names found in
    ``class_type.__swig_getmethods__`` are answered by their getter.

    Raises:
        AttributeError: for any other name (the name is the only argument).
    """
    if name == "thisown":
        return self.this.own()
    method = class_type.__swig_getmethods__.get(name, None)
    if method:
        return method(self)
    # Call form instead of ``raise AttributeError,name``: same exception,
    # but valid syntax on Python 3 as well.
    raise AttributeError(name)
def _swig_repr(self):
    """Return ``<module.Class; proxy of ... >`` for a SWIG proxy object.

    The ``proxy of`` part is left empty when ``self.this`` is missing or
    its ``__repr__`` fails.
    """
    try:
        strthis = "proxy of " + self.this.__repr__()
    except Exception:
        # Narrower than a bare ``except:`` so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        strthis = ""
    return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,)
import types
# Choose the proxy base class: ``types.ObjectType`` where it exists, else a
# plain fallback class.  ``_newclass`` records which one was picked.
try:
    _object = types.ObjectType
    _newclass = 1
except AttributeError:
    class _object : pass
    _newclass = 0
del types
+fix = _pytidy.fix
diff --git a/test_DOMTree.py b/test_DOMTree.py
new file mode 100644
index 0000000..3fef7bb
--- /dev/null
+++ b/test_DOMTree.py
@@ -0,0 +1,36 @@
+import pageparser,datamgr
+from pytidy import pytidy
+#~ print pytidy.fix("
# Dump every recorded tag together with its tree links, text and density.
# NOTE(review): `pageparser` is imported at the top of this script as a
# module, and the module as shown defines no top-level `domTree`; presumably
# lines not visible here rebind the name to a parser/tree object -- confirm.
for rectag in pageparser.domTree:
    print rectag.tag,': '
    print ' parent: ',rectag.parent
    print ' preSibling: ',rectag.preSibling
    print ' nextSibling: ',rectag.nextSibling
    print ' children: ',[child.tag for child in pageparser.domTree.get_children(rectag)]
    print ' siblings: ',[sibling.tag for sibling in pageparser.domTree.get_siblings(rectag)]
    # The text is followed by its length.
    print ' data: ',rectag.data,len(rectag.data)
    print ' density: ',rectag.density
diff --git a/test_fann.py b/test_fann.py
new file mode 100644
index 0000000..f635126
--- /dev/null
+++ b/test_fann.py
@@ -0,0 +1,26 @@
+import fann
+from pyfann import libfann
+#~ ann=fann.NeuNet()
+#~ ann.create_from_file("cream.net")
def test(l,res):
    """Run the network on input ``l`` and print the output next to the expected ``res``."""
    actual = ann.run(l)
    print("%s should be %s" % (actual, res))
# NOTE(review): `ann` is only created in the commented-out lines above, so it
# must be defined in a part of this script not visible here -- confirm.
# Spot-check one input vector; the expected network output is -1.
test([32, 0.42, 0, 0],-1)
# Created here but not used in the visible part of the script.
test_data = libfann.training_data()
print "MSE error on test data: %f" % ann.get_MSE()
+#~ calc_out=ann.run([350,0.83,0.8])
+#~ print calc_out,' should be: ','True'
+#~ print ann.run([114,0.94,0.98,0.96]),[114,0.94,0.98,0.96]
+#~ print ann.run([38,0.7,0,0.0]),'should be: False'