Commit

Merge pull request #85 from brave/move-to-pytorch

Switch from tensorflow to sentence-transformer

LorenzoMinto authored Aug 9, 2023
2 parents 0b9b465 + 2004c42 commit 389913f
Showing 7 changed files with 140 additions and 83 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/push.yml
@@ -21,4 +21,4 @@ jobs:
aws-secret-access-key: ${{ secrets.GDBP_AWS_SECRET_ACCESS_KEY }}
aws-region: us-west-2
- name: Generalized Deployments
uses: brave-intl/general-docker-build-pipeline-action@v1.0.9
uses: brave-intl/general-docker-build-pipeline-action@v1
39 changes: 27 additions & 12 deletions README.md
@@ -1,17 +1,32 @@
# brave-news-source-suggestion

Pipeline for producing the source embedding representations and similarity matrix needed for source suggestion feature in Brave News.
Service for producing the source embedding representations and similarity matrix needed for the source suggestion feature in Brave News.

## Scripts
Run the scripts in the order in which they are presented.

`source-feed-accumulator.py`: parses periodically Brave News's feed, creating articles buckets for each source. These buckets are collected in `articles_history.csv` and catalogued by the `publisher_id` attribute.

`sources-similarity-matrix.py`: takes in the source buckets and produces an 512-dimensional embedding for each source, built as the mean of the 512-dimensional embeddings of all articles belonging to the source, as generated by the Universal Sentence Encoder model (https://arxiv.org/abs/1803.11175).
## Installation

## Outputs

`source_embeddings.csv`: [`index | publisher_id | 0 | 1 ... | ... 511`] stores all the 512-dimensional embeddings for each source under its `publisher_name`.

`source_similarity_t10.json` stores the top-10 most similar sources, with similarity score, for each source.
```
pip install -r requirements.txt
```

## Scripts
**source-feed-accumulator.py**: periodically parses the Brave News feed, collecting articles for each source in `articles_history.csv`. For each article, we store the `publisher_id` attribute.

**sources-similarity-matrix.py**: takes the article history as input and produces a 384-dimensional embedding for each source, using the `sentence-transformers` package. In particular:
- `all-MiniLM-L6-v2` for English-language sources.
- `paraphrase-multilingual-MiniLM-L12-v2` for non-English-language sources.
Once all source embeddings are generated, a pairwise source similarity matrix is produced.

## Running locally
To collect and accumulate article history:
```
export NO_UPLOAD=1
export NO_DOWNLOAD=1
python source-feed-accumulator.py
```

To compute source embeddings and produce the source similarity matrix:
```
export NO_UPLOAD=1
export NO_DOWNLOAD=1
python sources-similarity-matrix.py
```
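
For orientation, the embedding step the README describes reduces to a few lines of `sentence-transformers`. A minimal sketch, assuming the English model from `config.py` (the titles here are placeholders):

```
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Placeholder headlines standing in for one source's article history.
titles = ["Example headline one", "Example headline two"]

# encode() returns one 384-dimensional vector per title; the source
# representation is the mean of those vectors, as in embeddings.py.
source_embedding = model.encode(titles).mean(axis=0)
print(source_embedding.shape)  # (384,)
```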
17 changes: 11 additions & 6 deletions config.py
@@ -11,12 +11,13 @@
BRAVE_TODAY_CLOUDFRONT_CANONICAL_ID = os.getenv('BRAVE_TODAY_CLOUDFRONT_CANONICAL_ID', None)

LANG_REGION_MODEL_MAP = os.getenv('LANG_REGION_MODEL_MAP', [
('en_US', "https://tfhub.dev/google/universal-sentence-encoder/4"),
('en_CA', "https://tfhub.dev/google/universal-sentence-encoder/4"),
('es_ES', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'),
('es_MX', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'),
('pt_BR', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'),
('ja_JP', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'),
('en_US', "sentence-transformers/all-MiniLM-L6-v2"),
('en_CA', "sentence-transformers/all-MiniLM-L6-v2"),
('en_GB', "sentence-transformers/all-MiniLM-L6-v2"),
('es_ES', 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'),
('es_MX', 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'),
('pt_BR', 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'),
('ja_JP', 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'),
])

SOURCES_JSON_FILE = os.getenv('SOURCES_JSON_FILE', 'sources.{LANG_REGION}')
@@ -32,3 +33,7 @@
SOURCE_SIMILARITY_T10_HR = os.getenv('SOURCE_SIMILARITY_T10_HR', "source_similarity_t10_hr.{LANG_REGION}")

SOURCE_EMBEDDINGS = os.getenv('SOURCE_EMBEDDINGS', "SOURCE_EMBEDDINGS.{LANG_REGION}")

if SENTRY_URL := os.getenv('SENTRY_URL'):
import sentry_sdk
sentry_sdk.init(dsn=SENTRY_URL, traces_sample_rate=0)
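
Several lang-regions share a model, so a consumer of `LANG_REGION_MODEL_MAP` can cache by model name rather than loading once per region. A sketch under that assumption; the `get_model` helper is hypothetical and not part of this change:

```
from sentence_transformers import SentenceTransformer

import config

_model_cache = {}

def get_model(lang_region):
    # LANG_REGION_MODEL_MAP is a list of (lang_region, model_name) pairs.
    model_name = dict(config.LANG_REGION_MODEL_MAP)[lang_region]
    if model_name not in _model_cache:
        _model_cache[model_name] = SentenceTransformer(model_name)
    return _model_cache[model_name]
```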
31 changes: 31 additions & 0 deletions embeddings.py
@@ -0,0 +1,31 @@
import numpy as np
from sentence_transformers import util
from structlog import get_logger

import config

EMBEDDING_DIMENSIONALITY = 384

logger = get_logger()


def compute_source_similarity(source_1, source_2, function='cosine'):
if function == 'dot':
return util.dot_score(source_1, np.transpose(source_2))
elif function == 'cosine':
return util.pytorch_cos_sim(source_1, source_2)[0][0]


def get_source_representation_from_titles(titles, model):
if len(titles) < config.MINIMUM_ARTICLE_HISTORY_SIZE:
return np.zeros((1, EMBEDDING_DIMENSIONALITY))

return model.encode(titles).mean(axis=0)


def compute_source_representation_from_articles(articles_df, publisher_id, model):
publisher_bucket_df = articles_df[articles_df.publisher_id == publisher_id]

titles = [
title for title in publisher_bucket_df.title.to_numpy() if title is not None]
return get_source_representation_from_titles(titles, model)
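
A usage sketch for the new module, assuming the CSV layout that `source-similarity-matrix.py` reads; the publisher ids are placeholders:

```
import pandas as pd
from sentence_transformers import SentenceTransformer

from embeddings import (compute_source_representation_from_articles,
                        compute_source_similarity)

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

articles_df = pd.read_csv('articles_history.csv', header=None)
articles_df.columns = ['title', 'description', 'timestamp', 'publisher_id']

repr_a = compute_source_representation_from_articles(articles_df, 'pub_a', model)
repr_b = compute_source_representation_from_articles(articles_df, 'pub_b', model)

# Cosine is the default; pytorch_cos_sim returns a 1x1 tensor that
# compute_source_similarity unwraps to a scalar.
score = compute_source_similarity(repr_a, repr_b)
```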
14 changes: 8 additions & 6 deletions requirements.txt
@@ -1,12 +1,14 @@
feedparser==6.0.10
numpy==1.23.5
pandas==1.5.1
requests==2.28.1
requests==2.31.0
scipy==1.9.3
tensorflow==2.9.3
tensorflow_text==2.9.0
tensorflow_hub==0.12.0
tqdm==4.64.1
sentence-transformers==2.2.2
sentry-sdk==1.28.1
tqdm==4.65.0
boto3==1.26.14
botocore==1.29.14
structlog==22.2.0
structlog==22.3.0
torch==2.0.1
torchvision==0.15.2
transformers==4.31.0
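
A quick sanity check that the pinned stack imports cleanly and matches `EMBEDDING_DIMENSIONALITY` from embeddings.py (a sketch, not part of the repo):

```
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(["hello world"])
assert embeddings.shape == (1, 384)
print(torch.__version__)  # expect 2.0.1 per requirements.txt
```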
94 changes: 37 additions & 57 deletions source-similarity-matrix.py
@@ -4,55 +4,24 @@

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from sentence_transformers import SentenceTransformer
from structlog import get_logger
from tqdm import tqdm

import config
from utils import download_file, upload_file
from embeddings import (EMBEDDING_DIMENSIONALITY,
compute_source_representation_from_articles,
compute_source_similarity)
from utils import (clean_source_similarity_file, download_file,
get_source_id_for_title, upload_file)

logger = get_logger()


def embed(input):
return model(input)


# Take centroid of 512-d embeddings
def get_source_representation_from_titles(titles):
source_repr = np.zeros((1, 512))
if len(titles) < config.MINIMUM_ARTICLE_HISTORY_SIZE:
return source_repr

for title in titles:
source_repr += embed([title])[0]
norm_repr = tf.nn.l2_normalize(source_repr / len(titles), axis=1)
return norm_repr.numpy()


def compute_source_similarity(source1, source2, t='dot'):
cosine_similarities = np.dot(source1, np.transpose(source2))
clip_cosine_similarity = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
score = 1.0 - tf.acos(clip_cosine_similarity) / math.pi
return score


def compute_source_representation_from_articles(articles_df, publisher_id):
publisher_bucket_df = articles_df[articles_df.publisher_id == publisher_id]

source_titles = [title for title in publisher_bucket_df.title.to_numpy() if title is not None]
return get_source_representation_from_titles(source_titles)


def get_source_id_for_title(title, sources_df):
return sources_df[sources_df.publisher_name == title].publisher_id.to_numpy()[0]


# Compute similarity matrix for all existing LANG_REGION pairs
for lang_region, model_url in config.LANG_REGION_MODEL_MAP:
logger.info(f"Started computing similarity matrix for {lang_region} using {model_url}")
for lang_region, model_name in config.LANG_REGION_MODEL_MAP:
logger.info(
f"Started computing similarity matrix for {lang_region} using {model_name}")

pathlib.Path(config.OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

@@ -76,24 +45,28 @@ def get_source_id_for_title(title, sources_df):
header=None)
articles_df.columns = ['title', 'description', 'timestamp', 'publisher_id']

logger.info("Loading Universal Sentence Encoder...")
module_url = model_url # @param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
logger.info(f"USE ({module_url}) loaded")
logger.info("Loading Embedding Model...")
model = SentenceTransformer(model_name)
logger.info(f"Model ({model_name}) loaded")

logger.info("Building sources embeddings...")
publisher_ids = sources_df.publisher_id.to_numpy()
logger.info(f"Publisher ids size: {publisher_ids.size}")

# For each publisher, compute source representations from all stored
# articles for that publisher.
reprs = np.zeros((publisher_ids.size, 512))
reprs = np.zeros((publisher_ids.size, EMBEDDING_DIMENSIONALITY))
for i, publisher_id in tqdm(enumerate(publisher_ids)):
reprs[i, :] = compute_source_representation_from_articles(articles_df, publisher_id)
reprs[i, :] = compute_source_representation_from_articles(
articles_df, publisher_id, model)
if not reprs[i, :].any():
logger.warning(
f"Source {sources_df[sources_df.publisher_id == publisher_id].publisher_name.item()} has no articles. Skipping...")

logger.info(f"Computing sources representations for {lang_region}")
sources_representation = pd.DataFrame({'publisher_id': publisher_ids})
sources_representation = pd.concat([sources_representation, pd.DataFrame(reprs)], axis=1)
sources_representation = pd.concat(
[sources_representation, pd.DataFrame(reprs)], axis=1)
sources_representation.to_csv(
f'output/{config.SOURCE_EMBEDDINGS.format(LANG_REGION=lang_region)}.csv', header=None)
logger.info("Finished building source embeddings.")
@@ -130,29 +103,36 @@ def get_source_id_for_title(title, sources_df):
# Only include suggestion if within 10% of the best match's score
top_similarity_score = 0.0
if sources_ranking:
top_similarity_score = sources_ranking[0][1]
top_similarity_score = sources_ranking[0][1]
similarity_cutoff = config.SIMILARITY_CUTOFF_RATIO * top_similarity_score
top10_dictionary[source_id] = [{'source': get_source_id_for_title(source[0], sources_df), 'score': source[1]} for
top10_dictionary[source_id] = [{'source': get_source_id_for_title(source[0], sources_df), 'score': source[1]}
for
source in sources_ranking[:10] if source[1] > similarity_cutoff]
top10_dictionary_human_readable[feed] = [{'source': source[0], 'score': source[1]} for source in
sources_ranking[:10] if source[1] > similarity_cutoff]

logger.info("Removing un-matched sources")
top10_dictionary = clean_source_similarity_file(
sources_data, top10_dictionary)

logger.info("Outputting sources similarities files")
with open(f'output/{config.SOURCE_SIMILARITY_T10.format(LANG_REGION=lang_region)}.json', 'w', encoding='utf-8') as f:
json.dump(top10_dictionary, f, ensure_ascii=True)
with open(f'output/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json', 'w', encoding='utf-8') as f:
json.dump(top10_dictionary_human_readable,
f, ensure_ascii=True)
with open(f'output/{config.SOURCE_SIMILARITY_T10.format(LANG_REGION=lang_region)}.json', 'w') as f:
json.dump(top10_dictionary, f)
with open(f'output/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json', 'w') as f:
json.dump(top10_dictionary_human_readable, f)

logger.info("Script has finished running.")

if not config.NO_UPLOAD:
upload_file(config.OUTPUT_DIR + "/" + f'/{config.SOURCE_SIMILARITY_T10.format(LANG_REGION=lang_region)}.json',
config.PUB_S3_BUCKET,
f"source-suggestions/{config.SOURCE_SIMILARITY_T10.format(LANG_REGION=lang_region)}.json")

upload_file(config.OUTPUT_DIR + "/" + f'/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json',
config.PUB_S3_BUCKET,
f"source-suggestions/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json")
upload_file(
config.OUTPUT_DIR + "/" +
f'/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json',
config.PUB_S3_BUCKET,
f"source-suggestions/{config.SOURCE_SIMILARITY_T10_HR.format(LANG_REGION=lang_region)}.json")

upload_file(config.OUTPUT_DIR + "/" + f'/{config.SOURCE_EMBEDDINGS.format(LANG_REGION=lang_region)}.csv',
config.PUB_S3_BUCKET,
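
The ranking step above keeps at most ten suggestions per source and drops anything more than 10% below the best match. A self-contained sketch of that logic; `feeds`, `similarity_matrix`, and the cutoff value are assumptions standing in for the script's state:

```
import numpy as np

SIMILARITY_CUTOFF_RATIO = 0.9  # assumed value; the real one comes from config

def top10_for(feed_idx, feeds, similarity_matrix):
    scores = similarity_matrix[feed_idx]
    # Rank all other feeds by similarity, highest first.
    ranking = sorted(((feeds[j], float(scores[j]))
                      for j in range(len(feeds)) if j != feed_idx),
                     key=lambda pair: pair[1], reverse=True)
    if not ranking:
        return []
    cutoff = SIMILARITY_CUTOFF_RATIO * ranking[0][1]
    # Keep at most 10 suggestions, all within 10% of the best match's score.
    return [{'source': name, 'score': score}
            for name, score in ranking[:10] if score > cutoff]
```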
26 changes: 25 additions & 1 deletion utils.py
@@ -1,6 +1,8 @@
import logging
import mimetypes

import boto3
import numpy as np
from botocore.exceptions import ClientError

import config
@@ -17,9 +19,11 @@ def upload_file(file_name, bucket, object_name=None):
if object_name is None:
object_name = file_name
try:
content_type = mimetypes.guess_type(file_name)[0] or 'binary/octet-stream'
s3_client.upload_file(file_name, bucket, object_name, ExtraArgs={
'GrantRead': f'id={config.BRAVE_TODAY_CLOUDFRONT_CANONICAL_ID}',
'GrantFullControl': f'id={config.BRAVE_TODAY_CANONICAL_ID}'
'GrantFullControl': f'id={config.BRAVE_TODAY_CANONICAL_ID}',
'ContentType': content_type
})

except ClientError as e:
@@ -39,3 +43,23 @@ def download_file(file_name, bucket, object_name=None):
logging.error(e)
return False
return True


def clean_source_similarity_file(sources_data, sources_sim_data):
sources_id = [sources.get("publisher_id") for sources in sources_data]

for s_id in sources_id:
if s_id not in sources_sim_data:
sources_sim_data.pop(s_id, None)
continue

if s_id in sources_sim_data:
for index, suggestion in enumerate(sources_sim_data[s_id]):
if suggestion["source"] not in sources_id:
sources_sim_data[s_id].pop(index)

return sources_sim_data


def get_source_id_for_title(title, sources_df):
return sources_df[sources_df.publisher_name == title].publisher_id.to_numpy()[0]
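
A usage sketch for `clean_source_similarity_file`, with toy data shaped like the `top10_dictionary` that `source-similarity-matrix.py` builds; the ids are placeholders:

```
from utils import clean_source_similarity_file

sources_data = [{'publisher_id': 'a'}, {'publisher_id': 'b'}]
sources_sim_data = {
    'a': [{'source': 'b', 'score': 0.91},
          {'source': 'gone', 'score': 0.88}],  # 'gone' is not a known source
}

cleaned = clean_source_similarity_file(sources_data, sources_sim_data)
# Suggestions pointing at unknown publisher ids are dropped.
print(cleaned)  # {'a': [{'source': 'b', 'score': 0.91}]}
```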
