Skip to content

Commit

Permalink
this should be considered v1.0
Browse files (browse the repository at this point in the history)
- further refactoring, renaming, cleaning.
  • Loading branch information
aacid committed Jul 31, 2016
1 parent ecec97d commit 08ddb6b
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 58 deletions.
3 changes: 1 addition & 2 deletions Crawler4000/Crawler4000/Crawler4000.py
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
from source.ConfigManager import ConfigManager
from source.DBManager import DBManager
from source.FBManager import FBManager
from source.FriendManager import FriendManager

class Crawler4000(object):
def __init__(self):
Expand All @@ -20,7 +19,7 @@ def initScraper(self):
return

if self.scraper.login(login, password):
#self.scraper.scrapeFriendsRecursively(1000)
self.scraper.scrapeFriendsRecursively(10)
self.scraper.scrapeProfiles()

diplo = Crawler4000()
61 changes: 23 additions & 38 deletions Crawler4000/Crawler4000/Source/DBManager.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,7 +3,7 @@
class DBManager(object):
"""manages all interaction with DB"""
DB_NAME = 'data.db'
tables = ['Profiles', 'Friends', 'PersonalInfo']
tables = ['Profiles', 'Friends', 'Details']
is_connected = False

def __init__(self):
Expand Down Expand Up @@ -37,20 +37,20 @@ def createTable(self, name):
);""")
elif name == 'Friends':
self.c.execute("""CREATE TABLE `Friends` (
`IdPerson` TEXT NOT NULL,
`IdProfile` TEXT NOT NULL,
`IdFriend` TEXT NOT NULL,
PRIMARY KEY(IdPerson,IdFriend),
FOREIGN KEY(`IdPerson`) REFERENCES Profiles(id),
PRIMARY KEY(IdProfile,IdFriend),
FOREIGN KEY(`IdProfile`) REFERENCES Profiles(id),
FOREIGN KEY(`IdFriend`) REFERENCES Profiles(id)
);""")
elif name == 'PersonalInfo':
self.c.execute("""CREATE TABLE `PersonalInfo` (
`IdPerson` TEXT NOT NULL,
elif name == 'Details':
self.c.execute("""CREATE TABLE `Details` (
`IdProfile` TEXT NOT NULL,
`Type` TEXT NOT NULL,
`Name` TEXT,
`Info` TEXT NOT NULL,
PRIMARY KEY(IdPerson,Type,Info),
FOREIGN KEY(`IdPerson`) REFERENCES Profiles(id)
PRIMARY KEY(IdProfile,Type,Info),
FOREIGN KEY(`IdProfile`) REFERENCES Profiles(id)
);""")

def checkConsistency(self):
Expand All @@ -72,64 +72,49 @@ def Commit(self):
if counter == 0:
print "DB Error: could not commit."

def addPerson(self, id, name):
def addProfile(self, id, name):
try:
self.c.execute("INSERT OR IGNORE INTO Profiles (id, Name) VALUES (?, ?)", (id, name))
except sqlite3.Error as er:
print 'AddPerson:', er.message

def addPersons(self, list):
query = "INSERT OR IGNORE INTO Profiles VALUES"
for person in list:
query += " (" + list[0] + ", " + list[1] + "),"
query = query[:-1] + ";"

try:
self.c.execute(query)
except sqlite3.Error as er:
print 'AddPersons:', er.message
print 'DB AddProfile:', er.message

def createConnection(self, id, friend):
try:
self.c.execute("INSERT OR IGNORE INTO Friends (IdPerson, IdFriend) VALUES (?, ?)", (id, friend))
self.c.execute("INSERT OR IGNORE INTO Friends (IdProfile, IdFriend) VALUES (?, ?)", (id, friend))
except sqlite3.Error as er:
print 'CreateConnection:', er.message
print 'DB CreateConnection:', er.message

def setPersonProfileScraped(self, id):
def setProfileScraped(self, id):
try:
self.c.execute("UPDATE Profiles SET ProfileScraped = ? WHERE id = ?", ('Y', id))
except sqlite3.Error as er:
print 'SetProfileScraped:', er.message
print 'DB SetProfileScraped:', er.message

def setPersonFriendsScraped(self, id):
def setFriendsScraped(self, id):
try:
self.c.execute("UPDATE Profiles SET FriendsScraped = ? WHERE id = ?", ('Y', id))
except sqlite3.Error as er:
print 'SetFriendsScraped:', er.message
print 'DB SetFriendsScraped:', er.message

def getPerson(self, id):
def getProfile(self, id):
self.c.execute("SELECT Name, ProfileScraped, FriendsScraped FROM Profiles WHERE id = ?", (id,))
result = self.c.fetchone()
return result

def getPersonInfo(self, id):
self.c.execute("SELECT * FROM PersonalInfo WHERE IdPerson = ?", (id,))
return self.c.fetchall()

def getPersonWithNoProfile(self):
def getProfileWithNoDetails(self):
self.c.execute("SELECT id FROM Profiles WHERE ProfileScraped = 'N' LIMIT 1")
result = self.c.fetchone()
return result[0]

def getPersonWithNoFriends(self):
def getProfileWithNoFriends(self):
self.c.execute("SELECT id FROM Profiles WHERE FriendsScraped = 'N' LIMIT 1")
result = self.c.fetchone()
return result[0]

def addPersonalInfo(self, person_id, list):
query = "INSERT OR IGNORE INTO PersonalInfo (IdPerson, Type, Name, Info) VALUES (?, ?, ?, ?)"
def addDetails(self, list):
query = "INSERT OR IGNORE INTO Details (IdProfile, Type, Name, Info) VALUES (?, ?, ?, ?)"

try:
self.c.executemany(query, list)
except sqlite3.Error as er:
print 'AddPersonalInfo:', er.message
print 'DB AddDetails:', er.message
28 changes: 15 additions & 13 deletions Crawler4000/Crawler4000/Source/FBManager.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
import mechanize, re, sys
from bs4 import BeautifulSoup
from source.FriendManager import FriendManager
from source.FriendManager import FriendScraper
from source.Profile import Profile

class FBManager(object):
Expand Down Expand Up @@ -41,35 +41,37 @@ def login(self, fb_username, fb_password):
print 'Successfuly logged into Facebook as user ' + name + '.'


self.db.addPerson(username, name)
self.db.setPersonProfileScraped(username)
self.db.addProfile(username, name)
self.db.setProfileScraped(username)

return True

def scrapeFriendsRecursively(self, limit):
counter = 0
print 'Getting friends of first ' + str(limit) + ' profiles.'
while counter < limit:
person_id = self.db.getPersonWithNoFriends()
if person_id == None:
profile_id = self.db.getProfileWithNoFriends()
if profile_id == None:
return
counter += 1
self.scrapeFriends(person_id)
print 'Profile #' + str(counter) + ':'
self.scrapeFriends(profile_id)

def scrapeFriends(self, id):
person = FriendManager(id)
person.getFriends(self.browser)
person.save(self.db)
scraper = FriendScraper(id)
scraper.getFriends(self.browser)
scraper.save(self.db)

def scrapeProfiles(self):
counter = 0
while True:
person_id = self.db.getPersonWithNoProfile()
if person_id == None:
profile_id = self.db.getProfileWithNoDetails()
if profile_id == None:
return
counter += 1
profile = Profile.loadProfile(person_id, self.db)
profile = Profile.loadProfile(profile_id, self.db)
profile.scrapeProfile(self.browser)
profile.save(self.db)
self.db.setPersonProfileScraped(person_id)
self.db.setProfileScraped(profile_id)
self.db.Commit()

4 changes: 2 additions & 2 deletions Crawler4000/Crawler4000/Source/FriendManager.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,7 +2,7 @@
from source.Profile import Profile
from bs4 import BeautifulSoup

class FriendManager(object):
class FriendScraper(object):
"""manages all profiles scraped"""

def __init__(self, id):
Expand Down Expand Up @@ -55,7 +55,7 @@ def getFriends(self, browser):
print "scraped " + str(counter) + ", could not scrape " + str(locked) + " profiles"

def save(self, db):
db.setPersonFriendsScraped(self.id)
db.setFriendsScraped(self.id)

print "saving " + str(len(self.profiles)) + " profiles"
if len(self.profiles) == 0:
Expand Down
6 changes: 3 additions & 3 deletions Crawler4000/Crawler4000/Source/Profile.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -16,15 +16,15 @@ def __init__(self, id, name):

@staticmethod
def loadProfile(id, db):
name, profile_scraped, friends_scraped = db.getPerson(id)
name, profile_scraped, friends_scraped = db.getProfile(id)

profile = Profile(id, name)
profile.scraped = profile_scraped == 'Y'
profile.friends_scraped = friends_scraped == 'Y'
return profile

def save(self, db):
db.addPerson(self.id, self.name)
db.addProfile(self.id, self.name)

if self.scraped:
info_list = []
Expand All @@ -45,7 +45,7 @@ def save(self, db):
info_list.append(('BasicInfo', name, value))

list = [ (self.id,) + x for x in info_list ]
db.addPersonalInfo(self.id, list)
db.addDetails(list)
print "saved " + str(len(list)) + " details"


Expand Down

0 comments on commit 08ddb6b

Please sign in to comment.