Skip to content

Commit

Permalink
Update: Added cosine similarity
Browse files Browse the repository at this point in the history
Signed-off-by: Preetham Kamidi <[email protected]>
  • Loading branch information
preetham committed Jun 12, 2019
1 parent 4875c80 commit 66c41b7
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 20 deletions.
8 changes: 7 additions & 1 deletion app/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,17 @@ class Config(object):
WORKER_CLASS = os.getenv('WORKER_CLASS')
FILE_DIRECTORY = os.getenv('FILE_DIRECTORY')
ALLOWED_EXTENSIONS = set(os.getenv('ALLOWED_EXTENSIONS').split(','))
SIMILARITY_THRESHOLD = float(os.getenv('SIMILARITY_THRESHOLD'))
TWITTER_HOSTNAME = os.getenv('TWITTER_HOSTNAME')
TWITTER_APIVER = os.getenv('TWITTER_APIVER')
TWITTER_CONTEXT = os.getenv('TWITTER_CONTEXT')
TWITTER_ACCESSTOKEN = os.getenv('TWITTER_ACCESSTOKEN')
TWEET_COUNT = os.getenv('TWEET_COUNT')
TWEET_COUNT = int(os.getenv('TWEET_COUNT'))
TWEET_DATE_KEY = os.getenv('TWEET_DATE_KEY')
TWEET_USERNAME_KEY = os.getenv('TWEET_USERNAME_KEY')
TWEET_COUNT_KEY = os.getenv('TWEET_COUNT_KEY')
TWEET_MAX_OLD = int(os.getenv('TWEET_MAX_OLD'))
TWEET_TEXT_KEY = os.getenv('TWEET_TEXT_KEY')

app_config = Config()

10 changes: 8 additions & 2 deletions app/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@

from app.logger.logger import logger
from app.services.image import uploader, processor
from app.services.nlp.processor import get_entities
from app.services.nlp.processor import get_entities, get_similarity
from app.services.search.controller import search_controller
from app.util.date_checker import valid
from app.config.config import app_config

app_router = Flask(__name__, static_folder=app_config.FILE_DIRECTORY)
Expand All @@ -30,7 +32,11 @@ def get_text_from_image():
logger.info('Processed text: ' + text)
entities = get_entities(text)
logger.info('Entities: ' + str(entities))
if not entities['date'] or not valid(entities['date']):
return "Date of Tweet too old", 400
same_day_tweets = search_controller(entities)
similarity = get_similarity(entities['tweet'], same_day_tweets)
return jsonify({
'status': 0,
'result': entities
'result': similarity
})
43 changes: 33 additions & 10 deletions app/services/nlp/processor.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,54 @@
from re import search
from datetime import timezone
from dateutil.parser import parse

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from app.logger.logger import logger
from app.config.config import app_config


count_vectorizer = CountVectorizer()

def get_entities(text: str):
if not text:
return {}
logger.info('Parsing data out of processed text...')
username_match = search(r'@(\w{1,15})\b', text)
date_match = search(r'\d{1,2}\s\w+\s\d{4}', text)
if not username_match or not date_match:
datetime_match = search(
r'((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))\s-\s\d{1,2}\s\w+\s\d{4}', text)
if not username_match or not datetime_match:
return {
'user_id': None,
'tweet': None,
'datetime': None
}
user_id = username_match.group()
date_str = date_match.group()
date = parse(date_str)
formatted_date = date.strftime('%Y-%m-%d')
user_id = username_match.group()[1:]
date_str = datetime_match.group().replace('-', '')
processed_datetime = parse(date_str).replace(tzinfo=timezone.utc)
username_end_index = username_match.end()
date_start_index = date_match.start()
tweet = text[username_end_index+5:date_start_index-10]
date_start_index = datetime_match.start()
tweet = text[username_end_index+5:date_start_index].strip()
return {
'user_id': user_id,
'tweet': tweet,
'date': formatted_date
'date': processed_datetime
}


def get_similarity(processed_tweet: str, same_day_tweets: list):
    """Tell whether the processed tweet closely matches any same-day tweet.

    The processed tweet and the candidate tweets are turned into token-count
    vectors with the module-level ``count_vectorizer``; the cosine similarity
    between the processed tweet and each candidate is then compared against
    ``app_config.SIMILARITY_THRESHOLD``.

    Args:
        processed_tweet: Tweet text extracted from the image.
        same_day_tweets: Texts of the tweets to compare against.

    Returns:
        An empty list when either argument is empty (kept as-is so existing
        callers see the same value), otherwise ``True`` if at least one
        candidate scores above the threshold and ``False`` if none does.
    """
    if not processed_tweet or not same_day_tweets:
        return []
    logger.info('Processing similarity of two tweets...')
    # Row 0 of the corpus is always the processed tweet.
    corpus = [processed_tweet, *same_day_tweets]
    logger.info('Corpus: ' + str(corpus))
    sparse_matrix = count_vectorizer.fit_transform(corpus)
    # Compare only the processed tweet (row 0) with the candidates (rows 1..).
    # Scanning the full pairwise matrix would include every document's
    # similarity with itself (1.0) and candidate-vs-candidate pairs, which
    # makes the check succeed regardless of the processed tweet.
    scores = cosine_similarity(sparse_matrix[0:1], sparse_matrix[1:])[0]
    logger.info('Similarity scores: ' + str(scores))
    return any(score > app_config.SIMILARITY_THRESHOLD for score in scores)
23 changes: 23 additions & 0 deletions app/services/search/controller.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from dateutil.parser import parse


from app.logger.logger import logger
from app.config.config import app_config
from app.util.date_checker import valid, format_for_date
from app.services.search import search


def search_controller(entities):
    """Collect the texts of a user's tweets posted on the processed tweet's day.

    Queries the search service for the user named in ``entities['user_id']``
    and keeps the text of every returned entry whose date falls on the same
    calendar day as ``entities['date']`` and passes ``valid``.

    Args:
        entities: Mapping with at least ``'user_id'`` and ``'date'`` entries,
            as produced by the NLP processor.

    Returns:
        A list of tweet texts. The list is empty when ``entities`` is empty
        or lacks a user id or date, so callers always receive a list.
    """
    # Without a user there is nothing to query, and without a date there is
    # nothing to match entries against; bail out before hitting the API.
    if not entities or not entities.get('user_id') or not entities.get('date'):
        return []
    logger.info('Searching for tweet using Twitter API...')
    querystring = {
        app_config.TWEET_USERNAME_KEY: entities['user_id'],
        app_config.TWEET_COUNT_KEY: app_config.TWEET_COUNT,
    }
    response = search.search_results(querystring)
    # The target day does not change per entry, so format it once.
    target_day = format_for_date(entities['date'])
    same_day_tweets = []
    for entry in response:
        tweet_date = parse(entry[app_config.TWEET_DATE_KEY])
        if format_for_date(tweet_date) == target_day and valid(tweet_date):
            logger.info('Tweet found...: ' + str(entry[app_config.TWEET_TEXT_KEY]))
            same_day_tweets.append(entry[app_config.TWEET_TEXT_KEY])
    return same_day_tweets
12 changes: 6 additions & 6 deletions app/services/search/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
from app.config.config import app_config


def search_results(entities):
if not entities['date'] or not entities['user_id'] or not entities['tweet']:
raise ValueError
def search_results(querystring):
if not querystring:
return []
headers = {'Authorization': 'Bearer ' + app_config.TWITTER_ACCESSTOKEN}
querystring = {'screen_name': entities['user_id'] , 'count': app_config.TWEET_COUNT}
search_url = urljoin(app_config.TWITTER_HOSTNAME + '/' +
app_config.TWITTER_APIVER + '/', app_config.TWITTER_CONTEXT)
r = requests.get(search_url, headers=headers, querystring=querystring)
r = requests.get(search_url, headers=headers, params=querystring)
response = r.json()
return response['statuses']
logger.info('Response for Twitter API: ' + str(r.status_code))
return response
17 changes: 17 additions & 0 deletions app/util/date_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from datetime import datetime, timezone

from app.config.config import app_config
from app.logger.logger import logger


def valid(processed_date):
    """Report whether a tweet date is recent enough to be processed.

    Args:
        processed_date: Datetime of the tweet. A falsy value is treated as
            invalid. A naive datetime is interpreted as UTC, the same
            convention the NLP processor uses when it stamps parsed dates.

    Returns:
        ``True`` when the whole-day age of ``processed_date`` relative to the
        current UTC time does not exceed ``app_config.TWEET_MAX_OLD``,
        ``False`` otherwise.
    """
    if not processed_date:
        return False
    # Aware and naive datetimes cannot be subtracted from one another, so
    # attach UTC to naive values before computing the age.
    if processed_date.utcoffset() is None:
        processed_date = processed_date.replace(tzinfo=timezone.utc)
    age = datetime.now(timezone.utc) - processed_date
    return age.days <= app_config.TWEET_MAX_OLD

def format_for_date(tweet_datetime: datetime):
    """Render a datetime as its calendar day in ``YYYY-MM-DD`` form."""
    day_pattern = '%Y-%m-%d'
    return tweet_datetime.strftime(day_pattern)
13 changes: 12 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,34 @@ autopep8==1.4.4
certifi==2019.3.9
chardet==3.0.4
Click==7.0
cycler==0.10.0
decorator==4.4.0
dnspython==1.16.0
eventlet==0.25.0
Flask==1.0.3
greenlet==0.4.15
gunicorn==19.9.0
idna==2.8
imageio==2.5.0
itsdangerous==1.1.0
Jinja2==2.10.1
joblib==0.13.2
kiwisolver==1.1.0
MarkupSafe==1.1.1
monotonic==1.5
networkx==2.3
numpy==1.16.4
Pillow==6.0.0
pycodestyle==2.5.0
pyparsing==2.4.0
pytesseract==0.2.6
python-dateutil==2.8.0
pytz==2019.1
regex==2019.6.8
PyWavelets==1.0.3
regex
requests==2.22.0
scikit-learn==0.21.2
scipy==1.3.0
six==1.12.0
urllib3==1.25.3
Werkzeug==0.15.4

0 comments on commit 66c41b7

Please sign in to comment.