Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

able to pull data from MongoDB to MySQL #6

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions MongoDBtoMySQL.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"Loads MongoDB to MySQL for queries"


from db import MySQL, MySQLConfig
from db import MySQLObject
from pymongo import MongoClient

class MongoDBLoader:
    """Copy link documents from a MongoDB collection into MySQL.

    ``__init__`` opens the MongoDB connection and keeps handles to the link
    and filtered collections; ``load_save`` walks the link collection and
    stores one ``MySQLObject`` row per document.
    """

    def __init__(self):
        """Set up connection."""
        print("Setting up connection...")
        settings = {
            'MONGODB_SERVER': "localhost",
            'MONGODB_PORT': 27017,
            'MONGODB_DB': "ecosystem_mapping",
            'MONGODB_FILTERED_COLLECTION': "filtered_collection",
            'MONGODB_LINK_COLLECTION': "link_collection",
        }
        connection = MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT'],
        )
        self.db = connection[settings['MONGODB_DB']]
        self.link_collection = self.db[settings['MONGODB_LINK_COLLECTION']]
        self.filtered_collection = self.db[settings['MONGODB_FILTERED_COLLECTION']]

    def load_save(self):
        """Loads in from MongoDB and saves to MySQL.

        For every distinct ``base_url`` in the link collection, each matching
        document is written as one ``MySQLObject`` row (``base_url`` and
        ``src_url`` encoded as UTF-8 bytes, ``dst_url`` stringified into
        ``links``).

        Raises:
            KeyError: if a document lacks ``src_url`` or ``dst_url``.
        """
        # The instance itself is not needed here; constructing it is what
        # opens the MySQL connection that MySQLObject.save() relies on.
        MySQL(config=MySQLConfig)

        # Documents are streamed straight into MySQL. Nothing is accumulated
        # in memory, so the footprint stays flat however large the
        # collection is.
        for base_url in self.link_collection.distinct("base_url"):
            for data in self.link_collection.find({"base_url": base_url}):
                src_url = data['src_url']
                dst_url = data['dst_url']
                MySQLObject(
                    base_url=bytes(base_url, 'utf-8'),
                    src_url=bytes(src_url, 'utf-8'),
                    links=str(dst_url),
                ).save()
                print("adding %s to MySQL" % base_url)

# Runs at module level: executing (or importing) this file builds a loader
# and immediately performs the MongoDB -> MySQL copy.
MongoDBLoader().load_save()
























33 changes: 33 additions & 0 deletions db/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from .config import *
from .models import *

import sqlalchemy as sa
import sqlalchemy.orm as sao
import mongoengine as me


class MySQL(object):
    """MySQL database connection abstraction

    Creating an instance builds the engine and a scoped session, registers
    the instance as ``MySQLBase.db`` and runs ``create_all`` for the metadata
    attached to ``MySQLBase``.
    """

    def __init__(self, config):
        """initialize connection

        :param config: object exposing ``username``, ``password``, ``host``
            and ``database`` attributes.
        """
        # Imported locally so the block does not depend on a new file-level
        # import.
        from urllib.parse import quote

        # Credentials are percent-encoded before being placed in the URL;
        # otherwise characters such as '@', ':' or '/' in a password would be
        # read as URL delimiters. Plain alphanumeric values are unchanged.
        self.engine = sa.create_engine(
            'mysql+pymysql://{username}:{password}@{host}/{database}'.format(
                username=quote(str(config.username), safe=''),
                password=quote(str(config.password), safe=''),
                host=config.host,
                database=config.database))
        self.session = sao.scoped_session(sao.sessionmaker(bind=self.engine))

        # set db to self
        MySQLBase.db = self

        # extra MySQL initialization
        MySQLBase.metadata.create_all(bind=self.engine)

class Mongo(object):
    """MongoDB database connection abstraction"""

    def __init__(self, config):
        """initialize connection

        :param config: object exposing a ``database`` attribute and,
            optionally, ``host`` and ``port``. When those are absent the
            values ``'localhost'`` and ``27017`` are used.
        """
        self.db = me.connect(
            config.database,
            host=getattr(config, 'host', 'localhost'),
            # Coerced to int so a port stored as text (e.g. '27017') is
            # still accepted by the driver.
            port=int(getattr(config, 'port', 27017)),
        )
19 changes: 19 additions & 0 deletions db/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""
Configuration file for database connections
"""

class MySQLConfig:
    """Connection settings for the MySQL server."""

    # Where the server lives and which schema to use.
    host, database = "localhost", "ecosystem_mapping"

    # Login credentials (both are the same value).
    username = password = "root"


class MongoConfig:
    """Connection settings for the MongoDB server."""

    database = "dbsamples"
    host = "localhost"
    # Stored as text, not as an integer.
    port = "27017"
67 changes: 67 additions & 0 deletions db/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from sqlalchemy.dialects import mysql
import sqlalchemy.ext.declarative as sad
import sqlalchemy as sa
import mongoengine as me


class Base:
    """Operations that every persistent object must provide."""

    def save(self):
        """Persist this object in place; subclasses must override."""
        raise NotImplementedError

    def delete(self):
        """Remove this object; subclasses must override."""
        raise NotImplementedError


class MySQLBase(sad.declarative_base(), object):
    """MySQL base object

    Abstract declarative base that gives every model an integer primary key
    plus small query/save helpers. All helpers go through ``db.session``, so
    ``db`` must be assigned before they are used.
    """

    __abstract__ = True
    # Connection wrapper exposing ``session``; None until one is assigned.
    db = None

    id = sa.Column(sa.Integer, primary_key=True)

    @classmethod
    def objects(cls, give_query=False, **data):
        """Look up rows of this model matching the given column values.

        :param give_query: when true, return the query object itself
            instead of the list of results.
        :param data: column=value filters passed to ``filter_by``.
        :return: the query if ``give_query`` is true, else ``query.all()``.
        """
        query = cls.query().filter_by(**data)
        return query if give_query else query.all()

    @classmethod
    def query(cls):
        """Returns query object"""
        return cls.db.session.query(cls)

    def save(self):
        """save object to database

        Adds the object to the session and commits. If the commit fails the
        session is rolled back before the error is re-raised, so the shared
        session is usable again for later saves.

        :return: ``self``.
        """
        session = self.db.session
        session.add(self)
        try:
            session.commit()
        except Exception:
            # Undo the failed transaction, then let the caller see the
            # original error.
            session.rollback()
            raise
        return self


class MySQLObject(MySQLBase):
    """sample MySQL object

    Maps to the ``links`` table; the ``id`` primary key comes from
    ``MySQLBase``.
    """

    __tablename__ = 'links'

    # String column declared with length 100.
    base_url = sa.Column(sa.String(100))
    # Declared as a MySQL BLOB column.
    src_url = sa.Column(mysql.BLOB())
    # Text column; the only one declared non-nullable.
    links = sa.Column(sa.Text, nullable = False)

class MongoBase(me.Document):
    """Abstract parent class for MongoDB documents."""

    # Flags this document class as abstract.
    meta = dict(abstract=True)


class MongoObject(MongoBase):
    """sample MongoDB object"""

    # ``max_length`` is the keyword StringField uses for its length limit;
    # the previous ``maxlength`` spelling did not set that limit.
    name = me.StringField(max_length=100)
    email = me.StringField(max_length=50, unique=True)
31 changes: 16 additions & 15 deletions preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ def load_save(self):
print("Processing URL: %s" % base_url)
for data in self.html_collection.find({"base_url": base_url}):

source = data['src_url']
# source = data['url']
# source = data['src_url']
source = data['url']
text = self.clean(data['body'])
tier = data['tier']
time = data['timestamp']
Expand Down Expand Up @@ -90,19 +90,20 @@ def clean(self, text):
return text

def remove_named_entity(self, text):
    """Rebuild *text* from the tokens that are not grouped into a chunk tree.

    Each sentence is word-tokenized, POS-tagged and passed to
    ``nltk.ne_chunk(..., binary = True)``. Items that are plain
    ``(word, pos)`` pairs are kept; items that are ``nltk.Tree`` nodes are
    skipped. The kept words are joined with single spaces.

    NOTE(review): this span looks like a diff hunk rendered without +/-
    markers, i.e. the live body is the old implementation and the
    commented-out copy plus ``return text`` is the replacement. Confirm which
    version is intended before relying on this behavior.
    """
    _text = list()
    for idx, sent in enumerate(nltk.sent_tokenize(text)):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = True):
            #if hasattr(chunk, 'lab'):
            if type(chunk) is not nltk.Tree:
                # Not a tree: unpack the (word, pos) pair and keep the word.
                word, pos = chunk
                # if pos == " " for further removal
                _text.append(word)
            else:
                #ne = ' '.join(c[0] for c in chunk.leaves())
                #self.named_entities.append(ne)
                continue
    return ' '.join(_text)
    # _text = list()
    # for idx, sent in enumerate(nltk.sent_tokenize(text)):
    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = True):
    # #if hasattr(chunk, 'lab'):
    # if type(chunk) is not nltk.Tree:
    # word, pos = chunk
    # # if pos == " " for further removal
    # _text.append(word)
    # else:
    # #ne = ' '.join(c[0] for c in chunk.leaves())
    # #self.named_entities.append(ne)
    # continue
    # return ' '.join(_text)
    # Never reached: the return above always exits first.
    return text

def remove_boilerplate(self, text):
jtext = justext.justext(text, justext.get_stoplist("English"))
Expand Down