initial
vitrun committed Dec 16, 2009
0 parents commit 2227828
Showing 12 changed files with 2,364 additions and 0 deletions.
1,750 changes: 1,750 additions & 0 deletions a.html

Large diffs are not rendered by default.

41 changes: 41 additions & 0 deletions cream.data
@@ -0,0 +1,41 @@
20 4 1
270 0.97 0.0 0.0
1
353 0.98 0.0 0.98
1
100 0.92 0 0.6
1
426 0.98 0.98 0.97
1
292 0.97 0.98 0.98
1
276 0.97 0.98 0.0
1
154 0.95 0.97 0
1
114 0.94 0.98 0.96
1
0 0 0 0
-1
79 0 0 0
-1
10 0 0 0
-1
0 0 0.92 0.93
-1
0 0 0.31 0.07
-1
12 0.17 0 0
-1
32 0.42 0 0
-1
118 0.81 0 0
-1
123 0.82 0 0.31
-1
18 0.55 0 0.73
-1
34 0.33 0.23 0.82
-1
200 0.89 0.06 0
-1
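
For context: this looks like a FANN training-data file. The header line "20 4 1" declares 20 training pairs, 4 inputs, and 1 output; each pair is a line of inputs followed by a line with the target value (1 or -1 here). A minimal loading sketch, assuming the old pyfann bindings used elsewhere in this commit:

# Sketch: parse cream.data with pyfann's training_data (assumed API).
from pyfann import libfann

data = libfann.training_data()
data.read_train_from_file('cream.data')   # reads the "20 4 1" header and the 20 pairs
print data.length_train_data()            # should print 20 if the file parses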
34 changes: 34 additions & 0 deletions cream.net
@@ -0,0 +1,34 @@
FANN_FLO_2.1
num_layers=2
learning_rate=0.700000
connection_rate=1.000000
network_type=0
learning_momentum=0.000000
training_algorithm=2
train_error_function=1
train_stop_function=0
cascade_output_change_fraction=0.010000
quickprop_decay=-0.000100
quickprop_mu=1.750000
rprop_increase_factor=1.200000
rprop_decrease_factor=0.500000
rprop_delta_min=0.000000
rprop_delta_max=50.000000
rprop_delta_zero=0.100000
cascade_output_stagnation_epochs=12
cascade_candidate_change_fraction=0.010000
cascade_candidate_stagnation_epochs=12
cascade_max_out_epochs=150
cascade_max_cand_epochs=150
cascade_num_candidate_groups=2
bit_fail_limit=3.49999999999999977796e-01
cascade_candidate_limit=1.00000000000000000000e+03
cascade_weight_multiplier=4.00000000000000022204e-01
cascade_activation_functions_count=10
cascade_activation_functions=3 5 7 8 10 11 14 15 16 17
cascade_activation_steepnesses_count=4
cascade_activation_steepnesses=2.50000000000000000000e-01 5.00000000000000000000e-01 7.50000000000000000000e-01 1.00000000000000000000e+00
layer_sizes=5 2
scale_included=0
neurons (num_inputs, activation_function, activation_steepness)=(0, 0, 0.00000000000000000000e+00) (0, 0, 0.00000000000000000000e+00) (0, 0, 0.00000000000000000000e+00) (0, 0, 0.00000000000000000000e+00) (0, 0, 0.00000000000000000000e+00) (5, 6, 5.00000000000000000000e-01) (0, 6, 0.00000000000000000000e+00)
connections (connected_to_neuron, weight)=(0, 1.16192409060151324862e-01) (1, 2.55974968703720300311e+01) (2, 1.46277055544460896641e+01) (3, 3.51973527701262511869e+01) (4, -5.13725333978066203144e+01)
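
This is a FANN save file for the trained network: layer_sizes=5 2 corresponds to the 4-input, 1-output net built in fann.py below, plus one bias neuron per layer. A hedged sketch of restoring and querying it, assuming the pyfann API; the input row is copied from cream.data:

# Sketch: load the saved net and run one example (assumed pyfann API).
from pyfann import libfann

ann = libfann.neural_net()
ann.create_from_file('cream.net')
print ann.run([270, 0.97, 0.0, 0.0])   # a row labelled 1 in cream.data; expect output near 1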
14 changes: 14 additions & 0 deletions creamer.py
@@ -0,0 +1,14 @@
#coding:utf-8
from urllib import urlopen
import pageparser, datamgr

url = 'http://house.focus.cn/showarticle/1911/572831.html'
rawContent = datamgr.to_utf8(urlopen(url).read())
parser = pageparser.CreamParser()  # renamed from "pageparser", which shadowed the module
parser.feed(rawContent)
#~ print 'url: ', url
#~ print 'title: ', parser.spot.title
#~ print 'keywords: ', parser.spot.keywords
#~ print 'body data: ', parser.bdata
parser.get_cream()  # extract the main content (the "cream") of the page
#~ print parser.cream
69 changes: 69 additions & 0 deletions datamgr.py
@@ -0,0 +1,69 @@
#coding:utf-8
from os import path
from types import StringTypes
import chardet

class Spot(object):
    def __init__(self, url, title='', keywords='', timestamp='', literal=''):
        self.url = url
        self.title = title
        self.keywords = keywords
        self.literal = literal
        self.timestamp = timestamp
        self.scream = None

    def set_scream(self, scream):
        self.scream = scream
    #~ def __str__(self):
        #~ return self.url
    #~ def __eq__(self,item):
        #~ return self.url==str(item).lower()

class CaselessDict(dict):
    """Dictionary whose string keys are compared case-insensitively."""

    def __init__(self, mapping=None):
        if mapping:
            if type(mapping) is dict:
                for k, v in mapping.items():  # was d.items(); d is undefined in this branch
                    self.__setitem__(k, v)
            elif type(mapping) in (list, tuple):
                d = dict(mapping)
                for k, v in d.items():
                    self.__setitem__(k, v)

        # super(CaselessDict, self).__init__(d)

    def __setitem__(self, name, value):
        # lower-case string keys so lookups are case-insensitive
        if type(name) in StringTypes:
            super(CaselessDict, self).__setitem__(name.lower(), value)
        else:
            super(CaselessDict, self).__setitem__(name, value)

    def __getitem__(self, name):
        if type(name) in StringTypes:
            return super(CaselessDict, self).__getitem__(name.lower())
        else:
            return super(CaselessDict, self).__getitem__(name)

    def __copy__(self):
        pass  # copying intentionally unsupported for now

def to_utf8(data, sencoding=None):
    """Decode raw bytes to UTF-8: try the hinted encoding, then GB18030,
    then GBK, then whatever chardet detects; fall back to the raw data."""
    if sencoding:
        try:
            return data.decode(sencoding).encode('utf-8')
        except Exception, e:
            pass

    try:
        return data.decode('GB18030').encode('utf-8')  # was 'GBK18030', which is not a codec
    except Exception, e:
        try:
            return data.decode('GBK').encode('utf-8')
        except Exception, e:
            try:
                sencoding = chardet.detect(data)['encoding']
                return data.decode(sencoding).encode('utf-8')
            except Exception, e:
                return data

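A brief usage sketch of the two helpers above; the GBK byte string is an illustrative example:

# Sketch: exercising datamgr's CaselessDict and to_utf8.
import datamgr

d = datamgr.CaselessDict([('Title', 'hello')])
print d['title']                            # keys are lower-cased on insert and lookup

gbk_bytes = u'\u623f\u4ea7'.encode('GBK')   # two Chinese characters as GBK bytes
print datamgr.to_utf8(gbk_bytes)            # decoded via GB18030, re-encoded as UTF-8
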
32 changes: 32 additions & 0 deletions fann.py
@@ -0,0 +1,32 @@
from pyfann.libfann import neural_net, SIGMOID_SYMMETRIC_STEPWISE

connectionRate = 1
learningRate = 0.7
neuronsHiddenNum = 4

desiredError = 0.00005
maxIterations = 100000
iterationsBetweenReports = 1000
inNum = 4
outNum = 1

class NeuNet(neural_net):
    """FANN network with 4 inputs wired directly to 1 symmetric-sigmoid output."""
    def __init__(self):
        neural_net.__init__(self)
        #~ neural_net.create_sparse_array(self, connectionRate, (inNum, neuronsHiddenNum, outNum))
        neural_net.create_standard_array(self, (inNum, outNum))
        neural_net.set_learning_rate(self, learningRate)
        neural_net.set_activation_function_output(self, SIGMOID_SYMMETRIC_STEPWISE)

    def train_on_file(self, fileName):
        neural_net.train_on_file(self, fileName, maxIterations, iterationsBetweenReports, desiredError)

#~ Reference usage from the FANN examples:
#~ ann = libfann.neural_net()
#~ ann.create_sparse_array(connection_rate, (num_input, num_neurons_hidden, num_output))
#~ ann.set_learning_rate(learning_rate)
#~ ann.set_activation_function_output(libfann.SIGMOID_SYMMETRIC_STEPWISE)
#~ ann.train_on_file("../../examples/xor.data", max_iterations, iterations_between_reports, desired_error)
#~ ann.save("xor_float.net")

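A hedged sketch tying this commit together: train NeuNet on cream.data and save a net like cream.net above. File names are the ones in this commit; save and run are standard pyfann calls:

# Sketch: train on cream.data and save the result.
from fann import NeuNet

ann = NeuNet()
ann.train_on_file('cream.data')          # 20 pairs, 4 inputs, 1 output
ann.save('cream.net')                    # FANN save format, as shown above
print ann.run([426, 0.98, 0.98, 0.97])   # a positive row from cream.data
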
81 changes: 81 additions & 0 deletions grubbs.py
@@ -0,0 +1,81 @@
import math

# Grubbs critical ratios indexed by sample size n; each entry holds the
# critical values for significance levels [a=0.05, a=0.01].
GrubbsRatio={3:[1.15,1.16],
             4:[1.46,1.49],
             5:[1.67,1.75],
             6:[1.82,1.94],
             7:[1.94,2.10],
             8:[2.03,2.22],
             9:[2.11,2.32],
             10:[2.18,2.41],
             11:[2.23,2.48],
             12:[2.28,2.55],
             13:[2.33,2.61],
             14:[2.37,2.66],
             15:[2.41,2.70],
             16:[2.44,2.75],
             17:[2.48,2.78],
             18:[2.50,2.82],
             19:[2.53,2.85],
             20:[2.56,2.88],
             21:[2.58,2.91],
             22:[2.60,2.94],
             23:[2.62,2.96],
             24:[2.64,2.99],
             25:[2.66,3.01],
             # the following values were not guaranteed to be accurate:
             26:[2.68,3.03],
             27:[2.70,3.05],
             28:[2.72,3.07],
             29:[2.73,3.09],
             30:[2.74,3.10],
             }

def grubb_eleminate_outliers(rawList, a=0.05):
    """Remove values whose deviation from the mean exceeds the Grubbs
    critical ratio times the sample standard deviation."""
    if a==0.05:
        idx=0
    else:
        idx=1
    count=len(rawList)
    if count<=2 or count>30:  # the table only covers sample sizes 3..30
        return rawList
    ave=average(rawList)
    std=get_variance(rawList, ave)
    newList=[]
    for i in rawList:
        if math.fabs((ave-i)/float(std))<GrubbsRatio[count][idx]:
            newList.append(i)
    return newList

def get_variance(inList, ave):
    # Note: despite its name, this returns the sample standard deviation.
    total=0
    for i in inList:
        var=i-ave
        total+=var*var
    num=len(inList)
    if num>1:
        return math.sqrt(total/float(num-1))
    return None

def average(inList):
    total=0
    for i in inList:
        total+=i
    num=len(inList)
    if num>0:
        return total/float(num)
    return None
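
An illustrative run of the outlier filter; the numbers are made up, and with n=6 the a=0.05 critical ratio is 1.82:

# Sketch: Grubbs filtering on a small sample with one obvious outlier.
import grubbs

readings = [9.8, 10.1, 10.0, 9.9, 10.2, 25.0]
print grubbs.grubb_eleminate_outliers(readings)   # 25.0 deviates ~2.04 std devs > 1.82, so it is dropped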