diff --git a/MongoDBtoMySQL.py b/MongoDBtoMySQL.py
new file mode 100644
index 0000000..1ff8350
--- /dev/null
+++ b/MongoDBtoMySQL.py
@@ -0,0 +1,45 @@
+"Loads MongoDB to MySQL for queries"
+
+
+from db import MySQL, MySQLConfig
+from db import MySQLObject
+from pymongo import MongoClient
+
+class MongoDBLoader:
+    """Copies documents from the MongoDB link collection into MySQL."""
+
+    def __init__(self):
+        """Set up connection."""
+        print("Setting up connection...")
+        settings = {'MONGODB_SERVER': "localhost",
+                    'MONGODB_PORT': 27017,
+                    'MONGODB_DB': "ecosystem_mapping",
+                    'MONGODB_FILTERED_COLLECTION': "filtered_collection",
+                    'MONGODB_LINK_COLLECTION': "link_collection"}
+        connection = MongoClient(
+            settings['MONGODB_SERVER'],
+            settings['MONGODB_PORT']
+        )
+        self.db = connection[settings['MONGODB_DB']]
+        self.link_collection = self.db[settings['MONGODB_LINK_COLLECTION']]
+        self.filtered_collection = self.db[settings['MONGODB_FILTERED_COLLECTION']]
+
+    def load_save(self):
+        """Loads in from MongoDB and saves to MySQL.
+
+        Every document of the link collection is read, grouped by its
+        distinct ``base_url``, and saved as one ``MySQLObject`` row.
+        """
+        # Constructing MySQL binds MySQLBase.db and creates the tables; the
+        # instance itself is not used afterwards, so it is not kept.
+        MySQL(config=MySQLConfig)
+        for base_url in self.link_collection.distinct("base_url"):
+            for data in self.link_collection.find({"base_url": base_url}):
+                # Rows are written one at a time; nothing is accumulated in
+                # memory between documents.
+                MySQLObject(base_url=bytes(base_url, 'utf-8'),
+                            src_url=bytes(data['src_url'], 'utf-8'),
+                            links=str(data['dst_url'])).save()
+                print("adding %s to MySQL" % base_url)
+
+MongoDBLoader().load_save()
diff --git a/db/__init__.py b/db/__init__.py
new file mode 100644
index 0000000..b0b5e11
--- /dev/null
+++ b/db/__init__.py
@@ -0,0 +1,37 @@
+from .config import *
+from .models import *
+
+import sqlalchemy as sa
+import sqlalchemy.orm as sao
+import mongoengine as me
+
+
+class MySQL(object):
+    """MySQL database connection abstraction"""
+
+    def __init__(self, config):
+        """initialize connection"""
+        self.engine = sa.create_engine(
+            'mysql+pymysql://{username}:{password}@{host}/{database}'.format(
+                username=config.username,
+                password=config.password,
+                host=config.host,
+                database=config.database))
+        self.session = sao.scoped_session(sao.sessionmaker(bind=self.engine))
+
+        # set db to self
+        MySQLBase.db = self
+
+        # extra MySQL initialization
+        MySQLBase.metadata.create_all(bind=self.engine)
+
+class Mongo(object):
+    """MongoDB database connection abstraction"""
+
+    def __init__(self, config):
+        """initialize connection"""
+        # use the configured host/port (fall back to MongoDB defaults when
+        # the config does not define them); int() accepts a string port
+        self.db = me.connect(config.database,
+                             host=getattr(config, 'host', 'localhost'),
+                             port=int(getattr(config, 'port', 27017)))
diff --git a/db/config.py b/db/config.py
new file mode 100644
index 0000000..29dd81c
--- /dev/null
+++ b/db/config.py
@@ -0,0 +1,19 @@
+"""
+Configuration file for database connections
+"""
+
+class MySQLConfig:
+    """configuration for MySQL"""
+
+    username = 'root'
+    password = 'root'
+    host = 'localhost'
+    database = 'ecosystem_mapping'
+
+
+class MongoConfig:
+    """configuration for MongoDB"""
+
+    host = 'localhost'
+    port = '27017'
+    database = 'dbsamples'
diff --git a/db/models.py b/db/models.py
new file mode 100644
index 0000000..a617db1
--- /dev/null
+++ b/db/models.py
@@ -0,0 +1,75 @@
+from sqlalchemy.dialects import mysql
+import sqlalchemy.ext.declarative as sad
+import sqlalchemy as sa
+import mongoengine as me
+
+
+class Base(object):
+    """requirements for all objects"""
+
+
+    def save(self):
+        """save object in place"""
+        raise NotImplementedError()
+
+
+    def delete(self):
+        """delete object"""
+        raise NotImplementedError()
+
+
+class MySQLBase(sad.declarative_base(), object):
+    """MySQL base object"""
+
+    __abstract__ = True
+    db = None
+
+    id = sa.Column(sa.Integer, primary_key=True)
+
+    @classmethod
+    def objects(cls, give_query=False, **data):
+        """Returns rows filtered by ``data`` (the query if give_query)"""
+        query = cls.query().filter_by(**data)
+        return query if give_query else query.all()
+
+    @classmethod
+    def query(cls):
+        """Returns query object"""
+        return cls.db.session.query(cls)
+
+    def save(self):
+        """save object to database"""
+        session = self.db.session
+        session.add(self)
+        try:
+            session.commit()
+        except Exception:
+            # a failed commit must be rolled back before the session is reused
+            session.rollback()
+            raise
+        return self
+
+
+class MySQLObject(MySQLBase):
+    """sample MySQL object"""
+
+    __tablename__ = 'links'
+
+    base_url = sa.Column(sa.String(100))
+    src_url = sa.Column(mysql.BLOB())
+    links = sa.Column(sa.Text, nullable=False)
+
+class MongoBase(me.Document):
+    """MongoDB base object"""
+
+    meta = {
+        'abstract': True
+    }
+
+
+class MongoObject(MongoBase):
+    """sample MongoDB object"""
+
+    # StringField's length limit keyword is ``max_length``
+    name = me.StringField(max_length=100)
+    email = me.StringField(max_length=50, unique=True)
diff --git a/preprocessor.py b/preprocessor.py
index 6e45ab7..01318f4 100644
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -39,8 +39,8 @@ def load_save(self):
 
             print("Processing URL: %s" % base_url)
             for data in self.html_collection.find({"base_url": base_url}):
-                source = data['src_url']
-                # source = data['url']
+                # source = data['src_url']
+                source = data['url']
                 text = self.clean(data['body'])
                 tier = data['tier']
                 time = data['timestamp']
@@ -90,19 +90,20 @@ def clean(self, text):
         return text
 
     def remove_named_entity(self, text):
-        _text = list()
-        for idx, sent in enumerate(nltk.sent_tokenize(text)):
-            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = True):
-                #if hasattr(chunk, 'lab'):
-                if type(chunk) is not nltk.Tree:
-                    word, pos = chunk
-                    # if pos == " " for further removal
-                    _text.append(word)
-                else:
-                    #ne = ' '.join(c[0] for c in chunk.leaves())
-                    #self.named_entities.append(ne)
-                    continue
-        return ' '.join(_text)
+        # _text = list()
+        # for idx, sent in enumerate(nltk.sent_tokenize(text)):
+        #     for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = True):
+        #         #if hasattr(chunk, 'lab'):
+        #         if type(chunk) is not nltk.Tree:
+        #             word, pos = chunk
+        #             # if pos == " " for further removal
+        #             _text.append(word)
+        #         else:
+        #             #ne = ' '.join(c[0] for c in chunk.leaves())
+        #             #self.named_entities.append(ne)
+        #             continue
+        # return ' '.join(_text)
+        return text
 
     def remove_boilerplate(self, text):
         jtext = justext.justext(text, justext.get_stoplist("English"))