diff --git a/a.html b/a.html
new file mode 100644
index 0000000..240cdc3
--- /dev/null
+++ b/a.html
@@ -0,0 +1,1750 @@
+
+
+
+1:
+ return math.sqrt(sum/float(num-1))
+ return None
+
def average(inList):
    """Return the arithmetic mean of the numbers in *inList*.

    Returns None for an empty list (callers must handle the None
    sentinel, matching the std-dev helper above).
    """
    # Use the builtin sum() instead of a manual accumulator that
    # shadowed the builtin name `sum`; float() keeps true division
    # under Python 2.
    if inList:
        return sum(inList) / float(len(inList))
    return None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/pageparser.py b/pageparser.py
new file mode 100644
index 0000000..73e0660
--- /dev/null
+++ b/pageparser.py
@@ -0,0 +1,228 @@
# -*- coding: utf-8 -*-
+from sgmllib import SGMLParser
+import fann,grubbs
+
class ParseTag(object):
    """A tag which is parsed by the HTML parser(s).

    Instances compare case-insensitively against plain tag-name
    strings, so lists of ParseTags support membership tests like
    `'a' in features`.
    """

    def __init__(self, tag, elmlist, enabled=True, init=False):
        self.tag = tag          # tag name, e.g. 'a' or 'meta'
        self.elmlist = elmlist  # attribute names of interest for this tag
        self.enabled = enabled  # whether the parser should process this tag
        self.init = init        # True while the tag is currently open

    def disable(self):
        """Disable parsing of this tag."""
        self.enabled = False

    def enable(self):
        """Enable parsing of this tag."""
        self.enabled = True

    def isEnabled(self):
        """Is this tag enabled?"""
        return self.enabled

    def __eq__(self, tag):
        # Compares against a bare tag-name string, case-insensitively.
        return self.tag.lower() == tag.lower()

    def __ne__(self, tag):
        # FIX: Python 2 does not derive != from __eq__; without this,
        # `parsetag != 'a'` fell back to identity and was always True.
        return not self.__eq__(tag)

    def __hash__(self):
        # Keep instances hashable (Python 3 drops the default __hash__
        # once __eq__ is defined); hash consistently with __eq__.
        return hash(self.tag.lower())
+
+
class RecordTag(object):
    """One node of the DOMTree: an HTML tag plus its text and links.

    Attributes:
        inMe      -- True while the tag is still open during parsing
        data      -- accumulated character data found inside the tag
        density   -- text density: len(data) / (len(data) + markup size)
        parent, preSibling, nextSibling, children -- tree structure,
                     filled in by DOMTree
    """

    def __init__(self, tag, attrs, inme=True):
        self.tag = tag
        self.attrs = attrs      # list of (name, value) attribute pairs
        self.inMe = inme
        self.data = ''
        self.parent = None
        self.preSibling = None
        self.nextSibling = None
        self.density = 0
        self.children = []

    def calculate_density(self):
        """Compute self.density, the ratio of text to text-plus-markup."""
        try:
            total = 0.0
            for key, value in self.attrs:
                total += len(key) + len(value) + 1   # +1 for the '='
            # Markup overhead of '<tag ...>' + '</tag>':
            # 2*len(tag) plus 5 punctuation chars ('<', '>', '<', '/', '>').
            # (The old comment claimed 5 = len('<>')+len('>'), which is 3.)
            total += len(self.tag) * 2 + 5
            dataLen = len(self.data)
            self.density = dataLen / float(dataLen + total)
        except Exception as e:
            # FIX: `except Exception, e` is Python-2-only syntax; the
            # `as e` form works on Python 2.6+ and 3.  Best-effort: a
            # malformed attrs entry must not abort the whole parse.
            print(e)

    def set_in_me(self, inme):
        """Mark the tag open (True) or closed (False)."""
        self.inMe = inme

    def still_in_me(self):
        """Is this tag still open?"""
        return self.inMe

    def add_data(self, data):
        """Append character data encountered inside this tag."""
        self.data += data

    def __str__(self):
        return self.tag
+
class DOMTree(list):
    """A list of RecordTags in document order, with parent/sibling links.

    The list itself is the flat parse record; the `inMe` flag on each
    RecordTag distinguishes still-open tags from already-closed ones.
    """

    def __init__(self):
        # lastClosedRecTag: the most recently closed tag.
        # lastOpenRecTag:   the innermost tag that is still open.
        self.lastClosedRecTag = None
        self.lastOpenRecTag = None
        self.curTag = None
        # Purely presentational tags that never become tree nodes.
        self.omitTags = ['font', 'br', 'strong', 'b']

    def get_siblings(self, recTag):
        """Return recTag's siblings (its parent's children minus itself)."""
        if recTag:
            return [tag for tag in self.get_children(recTag.parent) if tag != recTag]
        return []

    def get_children(self, recTag):
        """Return recTag's children, or [] when recTag is None."""
        if recTag:
            return recTag.children
        return []

    def get_last_open_tag(self):
        """Scan backwards for the innermost still-open tag.

        On an empty tree the previous value of lastOpenRecTag is kept
        (it starts as None).
        """
        try:
            idx = -1
            while not self[idx].still_in_me():
                idx -= 1
            self.lastOpenRecTag = self[idx]
        except IndexError:
            pass

    def start_tag(self, tag, attrs):
        """Record an opening tag: create a node and link it into the tree."""
        if tag in self.omitTags:
            return
        self.get_last_open_tag()
        self.curTag = RecordTag(tag, attrs)
        try:
            preTag = self[-1]
            self.curTag.parent = self.lastOpenRecTag
            self.lastOpenRecTag.children.append(self.curTag)
            # If the lexically previous tag is already closed, the new
            # tag is the next sibling of the last closed tag.
            if not preTag.still_in_me():
                self.curTag.preSibling = self.lastClosedRecTag
                self.lastClosedRecTag.nextSibling = self.curTag
        except (AttributeError, IndexError):
            # First tag in the document: nothing to link to yet.
            pass
        self.append(self.curTag)

    def get_last_closed_tag(self):
        """Scan backwards for the innermost still-open tag and remember
        it as the tag about to be closed (used by end_tag)."""
        try:
            idx = -1
            while not self[idx].still_in_me():
                idx -= 1
            self.lastClosedRecTag = self[idx]
        except IndexError:
            pass

    def end_tag(self, tag):
        """Record a closing tag: mark the innermost open tag closed and
        compute its text density."""
        if tag in self.omitTags:
            return
        self.get_last_closed_tag()
        # FIX: a close tag arriving before any open tag (empty tree)
        # used to raise AttributeError on None here; ignore stray
        # close tags instead.
        if self.lastClosedRecTag is None:
            return
        self.lastClosedRecTag.set_in_me(False)
        self.lastClosedRecTag.calculate_density()

    def handle_data(self, data):
        """Attach character data to the current (or innermost open) tag."""
        self.get_last_open_tag()
        data = data.strip()
        try:
            if self.curTag.still_in_me():
                self.curTag.add_data(data)
            else:
                self.lastOpenRecTag.add_data(data)
        except AttributeError:
            # Data before any tag was opened: nothing to attach it to.
            pass
+
class SimpleParser(SGMLParser):
    """SGMLParser that builds a DOMTree from a page and uses a trained
    FANN neural network to pick out the main text content ("cream")."""

    # Tags of interest and the attributes to collect for each.
    # NOTE(review): this is a *class* attribute, and the callbacks below
    # mutate the ParseTags' `init` flags, so open/closed state is shared
    # by every SimpleParser instance -- confirm only one parser is live
    # at a time.
    features = [ ParseTag('a', ['href']),
                 ParseTag('link', ['href']),
                 ParseTag('body', []),
                 ParseTag('title',[]),
                 ParseTag('script',[]),
                 ParseTag('style',[]),
                 ParseTag('meta', ['CONTENT', 'content',]),
                 ]
    def __init__(self):
        # NOTE(review): SGMLParser.__init__ (which calls reset()) is not
        # invoked here; instances appear to be driven via CreamParser.
        self.cream=''
        self.domTree=DOMTree()
        # Load the pre-trained network from disk.
        self.ann=fann.NeuNet()
        self.ann.create_from_file("cream.net")

    def unknown_starttag(self, tag, attrs):
        """SGMLParser callback: flag a feature tag as open, grow the tree."""
        if tag in self.features:
            parsetag = self.features[self.features.index(tag)]
            parsetag.init=True
        self.domTree.start_tag(tag,attrs)

    def unknown_endtag(self, tag):
        """SGMLParser callback: flag a feature tag as closed."""
        if tag in self.features:
            parsetag=self.features[self.features.index(tag)]
            parsetag.init=False
        self.domTree.end_tag(tag)

    def handle_data(self, data):
        """SGMLParser callback: forward text unless inside <style>/<script>."""
        if not self.features[self.features.index('style')].init \
           and not self.features[self.features.index('script')].init:
            self.domTree.handle_data(data)

    def get_cream(self):
        """Print the tags judged to be main content.

        For every tag after <body>, feed [data length, own density,
        previous-sibling density, next-sibling density] into the neural
        net; candidates above the threshold are then filtered with
        Grubbs' outlier test on their positions.
        """
        idx=0
        bodyIdx=0
        # Locate <body>; only tags after it are considered.
        for rtag in self.domTree:
            if rtag.tag=='body':
                bodyIdx=idx
                break
            idx+=1
        candidates={}
        pos=0
        for rtag in self.domTree[bodyIdx+1:]:
            pos+=1
            if rtag.tag in ['textarea']:
                continue
            ownDensity=rtag.density
            # Fall back to the sibling-but-one when the direct sibling
            # carried no text (density 0.0).
            if rtag.preSibling:
                preDensity=rtag.preSibling.density
                if preDensity==0.0 and rtag.preSibling.preSibling:
                    preDensity=rtag.preSibling.preSibling.density
            else:
                preDensity=0.0
            if rtag.nextSibling:
                nextDensity=rtag.nextSibling.density
                if nextDensity==0.0 and rtag.nextSibling.nextSibling:
                    nextDensity=rtag.nextSibling.nextSibling.density
            else:
                nextDensity=0.0
            # Load the data we described above.
            calc_out=self.ann.run([len(rtag.data),ownDensity,preDensity,nextDensity])
            # NOTE(review): -0.7 looks like a trained decision threshold --
            # confirm against how cream.net was trained.
            if calc_out[0]>-0.7:
                candidates[pos]=rtag
            #~ print rtag.tag,' ',calc_out[0],' ',pos,' len:',len(rtag.data),' ',ownDensity,' ',preDensity,' ',nextDensity
            #~ print "==============================="
        #eleminate the tag that is far away from most of the tags
        validTagKeys=grubbs.grubb_eleminate_outliers(candidates.keys())
        validTagKeys.sort()
        for key in validTagKeys:
            print candidates[key].tag,' ',key,' ',candidates[key].data

    def reset(self):
        """Reset the underlying SGMLParser state."""
        SGMLParser.reset(self)
+
class CreamParser(SimpleParser):
    """ A parser based on effbot's sgmlop """

    def __init__(self):
        # This module should be built already!
        import sgmlop
        self.parser = sgmlop.SGMLParser()
        # sgmlop will call back into finish_starttag / finish_endtag /
        # handle_data on this object.
        self.parser.register(self)
        SimpleParser.__init__(self)

    def finish_starttag(self, tag, attrs):
        """sgmlop callback; delegate to the SGMLParser-style handler."""
        self.unknown_starttag(tag, attrs)

    def finish_endtag(self, tag):
        """sgmlop callback; delegate to the SGMLParser-style handler."""
        self.unknown_endtag(tag)

    def feed(self, data):
        """Push raw HTML into the sgmlop parser."""
        self.parser.feed(data)
diff --git a/pytidy/__init__.py b/pytidy/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pytidy/pytidy.py b/pytidy/pytidy.py
new file mode 100644
index 0000000..3fa417b
--- /dev/null
+++ b/pytidy/pytidy.py
@@ -0,0 +1,53 @@
+# This file was automatically generated by SWIG (http://www.swig.org).
+# Version 1.3.36
+#
+# Don't modify this file, modify the SWIG interface instead.
+# This file is compatible with both classic and new-style classes.
+
# SWIG runtime prelude: bind helpers used by the generated proxy code.
import _pytidy
import new
new_instancemethod = new.instancemethod
try:
    _swig_property = property
except NameError:
    pass # Python < 2.2 doesn't have 'property'.
def _swig_setattr_nondynamic(self,class_type,name,value,static=1):
    """SWIG-generated setattr: route writes through __swig_setmethods__;
    with static=1, refuse to create brand-new attributes."""
    if (name == "thisown"): return self.this.own(value)
    if (name == "this"):
        if type(value).__name__ == 'PySwigObject':
            self.__dict__[name] = value
            return
    method = class_type.__swig_setmethods__.get(name,None)
    if method: return method(self,value)
    if (not static) or hasattr(self,name):
        self.__dict__[name] = value
    else:
        raise AttributeError("You cannot add attributes to %s" % self)
+
def _swig_setattr(self,class_type,name,value):
    # Dynamic variant: new attributes are allowed (static=0).
    return _swig_setattr_nondynamic(self,class_type,name,value,0)
+
def _swig_getattr(self,class_type,name):
    # SWIG-generated getattr: route reads through __swig_getmethods__.
    if (name == "thisown"): return self.this.own()
    method = class_type.__swig_getmethods__.get(name,None)
    if method: return method(self)
    # NOTE: Python 2-only raise syntax (file is SWIG-generated; do not edit).
    raise AttributeError,name
+
def _swig_repr(self):
    # Generated repr; the bare except tolerates half-initialized proxies.
    try: strthis = "proxy of " + self.this.__repr__()
    except: strthis = ""
    return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,)
+
# Choose a base object class compatible with both classic and new-style
# class models (pre-2.2 Pythons lack types.ObjectType).
import types
try:
    _object = types.ObjectType
    _newclass = 1
except AttributeError:
    class _object : pass
    _newclass = 0
del types


# Public API: the wrapped C tidy routine.
fix = _pytidy.fix
+
+
diff --git a/test_DOMTree.py b/test_DOMTree.py
new file mode 100644
index 0000000..3fef7bb
--- /dev/null
+++ b/test_DOMTree.py
@@ -0,0 +1,36 @@
#coding:utf-8
# Ad-hoc manual test: parse a blog-page snippet and dump the DOMTree.
import pageparser,datamgr
from pytidy import pytidy

url='http://blog.qq.com/qzone/41533848/1260352786.htm'
spotObj=datamgr.Spot(url)

# Raw page content fed to the parser (runtime data -- do not edit).
rawContent="""


保障性住房缘何多是非


保障性住房是指政府为中低收入住房困难家庭所提供的限定标准、限定价格或租金的住房,由廉租住房、经济适用住房和政策性租赁住房构成。自从保障性住房推出之后,一直是非不断,以丑闻居多。


最近,武汉经适房“六连号”、郑州“经适房建别墅事件”等案例暴露出了保障性住房制度上的漏洞,更重要的是将政府执行部门的公信度降低到了极点。许多经济适用房被不符合条件的人占有,成为一些人合法吞噬低收入者福利的一种途径。经济适用房作为一种公共福利,是政府兴建、政府分配,政府成为直接主体,经济适用房的分配不公,使许多真正的中低收入者对于购置保障性住房失去了希望,社会影响极坏。



造成这样丑闻的原因主要是由于保障性住房的资源过于紧张,中低收入者庞大的需求量与紧张的房源之间不成比例,加之保障性住房的价格与市场上普通的商品房之间价格也有着较大的差异,这就使一些有着“投机思想”和“特权主义”的人费尽心思去徇私舞弊。


保障性住房成“鸡肋”房

"""
# NOTE(review): pageparser.py (as shown) defines SimpleParser and
# CreamParser, not HMSGMLOpParser, and neither __init__ takes an
# argument -- this script looks stale; confirm the intended class.
pageparser=pageparser.HMSGMLOpParser(spotObj)
pageparser.feed(rawContent)
pageparser.get_cream()

# NOTE(review): the next two lines appear mangled (the HTML argument to
# pytidy.fix was lost); as written the stray ") is a syntax error.
#~ print pytidy.fix("
")
# Dump every recorded tag with its tree links and density.
for rectag in pageparser.domTree:
    print rectag.tag,': '
    print ' parent: ',rectag.parent
    print ' preSibling: ',rectag.preSibling
    print ' nextSibling: ',rectag.nextSibling
    print ' children: ',[child.tag for child in pageparser.domTree.get_children(rectag)]
    print ' siblings: ',[sibling.tag for sibling in pageparser.domTree.get_siblings(rectag)]
    print ' data: ',rectag.data,len(rectag.data)
    print ' density: ',rectag.density
diff --git a/test_fann.py b/test_fann.py
new file mode 100644
index 0000000..f635126
--- /dev/null
+++ b/test_fann.py
@@ -0,0 +1,26 @@
# Ad-hoc script: train the "cream" classifier network from cream.data,
# save it, then eyeball a few predictions and report the MSE.
import fann
from pyfann import libfann

ann=fann.NeuNet()
ann.train_on_file("cream.data")
ann.save("cream.net")

#~ ann=fann.NeuNet()
#~ ann.create_from_file("cream.net")

def test(l,res):
    """Run input vector *l* through the net and print the output next to
    the expected label *res* for manual comparison."""
    print "%s should be %s"%(ann.run(l),res)

# NOTE(review): the first case passes only 3 inputs while the others
# pass 4 -- confirm the network's expected input length.
test([350,0.83,0.8],True)
test([114,0.94,0.98,0.96],True)
test([38,0.7,0,0.0],False)
test([32, 0.42, 0, 0],-1)
# Re-run the training set through the net and report mean squared error.
test_data = libfann.training_data()
test_data.read_train_from_file("cream.data")
ann.test_data(test_data)
print "MSE error on test data: %f" % ann.get_MSE()

#~ calc_out=ann.run([350,0.83,0.8])
#~ print calc_out,' should be: ','True'
#~ print ann.run([114,0.94,0.98,0.96]),[114,0.94,0.98,0.96]
#~ print ann.run([38,0.7,0,0.0]),'should be: False'