# re_maxmatch.py

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
import string
import random
import math
import sys

def _load_lexicon(path='lexicon.txt'):
    """Read the whitespace-separated word list stored at *path*.

    Args:
        path: Location of the lexicon file.

    Returns:
        A list of the words in file order, duplicates kept.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The script declares UTF-8 in its coding header, so the lexicon is read
    # as UTF-8 rather than with the platform default encoding.
    # NOTE(review): assumes lexicon.txt really is UTF-8 -- confirm.
    with open(path, 'r', encoding='utf-8') as lexicon_file:
        # split() with no argument handles spaces, tabs, "\n" and "\r\n"
        # alike and never yields empty entries.
        return lexicon_file.read().split()


def _reverse_max_match(sentence, lexicon, max_len=4):
    """Segment *sentence* by reverse maximum matching.

    Working from the end of the sentence, the longest suffix (at most
    *max_len* characters) found in *lexicon* is cut off as a token. When no
    suffix matches, the last character is emitted as a token of its own so
    that the scan always makes progress.

    Args:
        sentence: Text to segment.
        lexicon: Container of known words supporting ``in``.
        max_len: Longest candidate word, in characters.

    Returns:
        The tokens in their original left-to-right order.
    """
    tokens = []
    end = len(sentence)
    while end > 0:
        for size in range(min(max_len, end), 0, -1):
            candidate = sentence[end - size:end]
            # size == 1 is the fallback: an unknown single character still
            # becomes a token, otherwise the loop would never terminate.
            if size == 1 or candidate in lexicon:
                tokens.append(candidate)
                end -= size
                break
    # Tokens were collected right-to-left; restore reading order.
    tokens.reverse()
    return tokens


def main():
    """Segment one sentence read from stdin using the words in lexicon.txt.

    Prints, in order: the number of lexicon entries read, an echo of the
    typed sentence, the sentence length, and then one token per line.
    """
    entries = _load_lexicon()
    print(len(entries))
    # A set gives constant-time membership tests during matching.
    lexicon = frozenset(entries)

    sentence = input("Input a sentence: ")
    print("You typed '%s'" % (sentence))
    print(len(sentence))

    for token in _reverse_max_match(sentence, lexicon):
        print(token)

# Disabled legacy code, parked in a string so that it is never executed.
# The text walks the 'data' directory to build a word list and one 0/1
# vector per article, then picks random class centres and assigns each
# article to the nearest one by L2 distance. It uses Python 2 print
# statements and names that the live code does not define (e.g. `identify`),
# so it would not run as-is.
# NOTE(review): `___comment___` is not referenced anywhere in the visible
# source -- confirm nothing else needs it before deleting.
___comment___ = """

for root, dirs, files in os.walk('data'):
		for file_name in files: 
			#print file_name
			file_obj = open(os.path.join('data',file_name), 'r')
			article_num += 1
			line = file_obj.readline()
			while line != "":
				#print line
				#ll = line.strip(' \n').split(' ')
				ll = line.translate(identify, del_estr)
				#ll = ll.translate(identify, del_cstr)
				ll = ll.split(' ')
				for w in ll:
					#print w
					if w not in words:
						words.append(w)
				line = file_obj.readline()
			file_obj.close()
			#break;
		words_num = len(words)
		print words_num
		print article_num
	
#read all news to matric raw_data[N][M]

	an_article = [0] * words_num
	n = 0
	#raw_data = [article_num][words_num]
	raw_data = []
	for root, dirs, files in os.walk('data'):
		for file_name in files:
			file_obj = open(os.path.join('data',file_name), 'r')
			line = file_obj.readline()
			while line != "":
				ll = line.translate(identify, del_estr)
				ll = line.split(' ')
				for w in ll:
					if w in words:
						#if w not in an_article:
						#an_article.append(words.index(w))
						#print words.index(w)
						#print an_article
						an_article[words.index(w)]=1
				line = file_obj.readline()
			raw_data.append(an_article)
			#an_article[n] = an_article	
			file_obj.close()
			n += 1
			#break;
		print len(raw_data)
		print len(raw_data[0])
		#print raw_data
		#print raw_data[len(raw_data)-1]
	

#create k number of random vectors, record to classcenter[K][M]
	k = random.randint(2, 6)
	classcenter = [0]*k
	kk = 0
	for kk in range(k):
		classcenter.append(raw_data[random.randint(0,article_number)])	
	print classcenter
#for each articles, calculate it's L2 distance and it's MIN value, record to class[N] 
# L2 distance =  sum[(Ai-Aj)*(Ai-Aj)], Ai and Bj is raw_data[i] and raw_data[j], i!=j
	dataclass = [0]*article_num
	tmp = [0]*k
	sum = 0
	dist = 0
	for l in range(article_num):
		for m in range(k):
			for n in range(words_num):
				sum += pow((raw_data[l][n]-classcentet[m][n]),2)
			dist = int(math.sqrt(sum))
			tmp[m] = dist
			sum=0
		dataclass[l] = tmp.index(min(tmp)) + 1
	print dataclass	

#update classcenter[K][M]

#if class[N] (old) == class[N] (new), It's OK! :)
"""
# Unconditional module-level call: the segmenter runs whenever this file is
# executed or imported.
main ()