Skip to content

Commit

Permalink
add rebuttal and 2024 baseline things
Browse files Browse the repository at this point in the history
  • Loading branch information
ritagonmar committed Feb 27, 2024
1 parent 569b3d8 commit d55b1ee
Show file tree
Hide file tree
Showing 78 changed files with 19,201 additions and 682 deletions.
18 changes: 12 additions & 6 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,28 @@ share/python-wheels/
data
results/variables
results/figures/bert-models
results/figures/old
results/figures/tsne*
results/figures/retracted*
results/figures/extra
results/figures/2024_baseline
results/figures/*.pdf
results/figures/*.png
scripts/*.ann
scripts/BERT-based-embeddings/07-rgm-runtimes-experiment.ipynb
scripts/BERT-based-embeddings/08-rgm-tokenizer-experiment.ipynb
scripts/BERT-based-embeddings/09-rgm-sanity-check-PubMedBERT.ipynb
scripts/BERT-based-embeddings/sBERT.py
scripts/BERT-based-embeddings/10-rgm-ls-PubMedNCL.ipynb
scripts/BERT-based-embeddings/12-rgm-SVD-RP-experiment.ipynb
scripts/BERT-based-embeddings/14-rgm-TF-IDF-covid-island.ipynb
scripts/BERT-based-embeddings/15-rgm-titles-exploration.ipynb
scripts/BERT-based-embeddings/sBERT.py
scripts/BERT-based-embeddings/Untitled.ipynb
scripts/BERT-based-embeddings/20ng.ipynb
scripts/18-rgm-figure-top10-genes.ipynb
scripts/19-rgm-data-generate-csv.ipynb
scripts/15-rgm-analysis-retracted-papers-LOCAL.ipynb
scripts/18-rgm-analysis-affiliations-2021-baseline-LOCAL.ipynb
scripts/19-rgm-data-generate-csv.ipynb
scripts/20-rgm-figure-top10-genes.ipynb
scripts/pmids.txt
scripts/2024-baseline/19.2-rgm-data-generate-csv.ipynb
scripts/2024-baseline/04-rgm-analysis-affiliations-LOCAL.ipynb
*.egg
MANIFEST

Expand Down
158 changes: 151 additions & 7 deletions pubmed_landscape_src/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import xml.etree.ElementTree as et
import os
import torch
import numpy as np

def xml_import(xml_file):
"""Parses some elements of the metadata in PubMed XML files.
Expand Down Expand Up @@ -60,12 +61,16 @@ def xml_import(xml_file):
into one (due to the asymmetry of the date storage, sometimes with <Day>, <Month> and <Year>, other times
with <MedlineDate>). If <PubDate> contains no further children, it will print ' '.
- First author first name: It parses the <ForeName> of the first <Author> listed in <Authorlist>. Note that sometimes the metadata is not perfect inside the tag there is the complete name, including surnames. If there is no tag <ForeName>, 'no tag' will be appended. Note for the future: Maybe this is misses some names directly listed in the tag <Author>, maybe an approach similar to what I do for abstracts would be better, where everything under the <Author> tag is parsed. In that case we would also have surnames, but that can be cleaned after.
- First author first name: It parses the <ForeName> of the first <Author> listed in <AuthorList>. Note that sometimes the metadata is not perfect and the tag contains the complete name, including surnames. If there is no tag <ForeName>, '' (empty string) will be appended. Note for the future: Maybe this misses some names directly listed in the tag <Author>; an approach similar to the one used for abstracts, where everything under the <Author> tag is parsed, might seem better. In that case we would also have surnames, but that can be cleaned afterwards. Update: actually not really, because the affiliation is also saved under <Author>. It is fine the way it is.
- Last author first name: It parses the <ForeName> of the last <Author> listed in <AuthorList>. Note that sometimes the metadata is not perfect and the tag contains the complete name, including surnames. If there is no tag <ForeName>, '' (empty string) will be appended. Note for the future: maybe the same problem as for first authors.
- ISSN (stored in <ISSN>): If there is no tag <ISSN>, it will add 'no tag'. If <ISSN> contains no text, it will add '' (empty string).
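- Affiliation first author: It parses the text of the <Affiliation> tag inside the first <AffiliationInfo> of the first <Author> listed in <AuthorList>. If there is no <AuthorList>, <AffiliationInfo> or <Affiliation> tag, or the tag contains no text, it will add '' (empty string).
- Affiliation last author: It parses the text of the <Affiliation> tag inside the first <AffiliationInfo> of the last <Author> listed in <AuthorList>. If there is no <AuthorList>, <AffiliationInfo> or <Affiliation> tag, or the tag contains no text, it will add '' (empty string).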
"""


Expand Down Expand Up @@ -217,7 +222,7 @@ def xml_import(xml_file):
tag=author.find('ForeName')

if tag is None:
ros.append(['no tag'])
ros.append("")
else:
res=[]
if not tag.text:
Expand Down Expand Up @@ -245,10 +250,7 @@ def xml_import(xml_file):
for elem in author.iter('Author'):
tag=elem.find('ForeName')
if tag is None:
res.append(['no tag'])
#for the next time, this is bad because then the element is a list and not a string
#it should be:
# res.append('no tag')
res.append("")
else:
if not tag.text:
res.append('')
Expand Down Expand Up @@ -280,6 +282,66 @@ def xml_import(xml_file):
ros=[' '.join(ele) for ele in ros]
dicc['ISSN']=ros


# Affiliation of the first author
ros = []
for child1 in xroot:
for child2 in child1:
for child3 in child2:
for child4 in child3.iter("Article"):
authorlist = child4.find("AuthorList")
if authorlist is None:
ros.append("")
else:
for elem in child4.iter("AuthorList"):
author = elem.find("Author")
affil = author.find("AffiliationInfo")
if affil is None:
ros.append("")
else:
tag = affil.find("Affiliation")
if tag is None:
ros.append("")
else:
res = []
if not tag.text:
res.append("")
else:
res.append(tag.text)
ros.append(res)

ros = [" ".join(ele) for ele in ros]
dicc['AffiliationFirstAuthor']=ros


# Affiliation of the last author
ros = []
for child1 in xroot:
for child2 in child1:
for child3 in child2:
for child4 in child3.iter("Article"):
authorlist = child4.find("AuthorList")
if authorlist is None:
ros.append("")
else:
for author in child4.iter("AuthorList"):
res = []
for elem in author.iter("Author"):
affil = elem.find("AffiliationInfo")
if affil is None:
res.append("")
else:
tag = affil.find("Affiliation")
if tag is None:
res.append("")
else:
if not tag.text:
res.append("")
else:
res.append(tag.text)
ros.append(res[-1])

dicc['AffiliationLastAuthor']=ros

out=pd.DataFrame.from_dict(dicc)
return out, dicc
Expand Down Expand Up @@ -313,6 +375,7 @@ def import_all_files(path, order_files=False):

name_files_array=np.array(name_files)
name_xml_files=name_files_array[len_filenames==17]
name_xml_files.sort()

# import
frame_all_df=[]
Expand Down Expand Up @@ -373,4 +436,85 @@ def generate_embeddings(abstracts, tokenizer, model, device):
embedding_cls = outputs[:, 0, :].numpy()


return embedding_cls, embedding_sep, embedding_av
return embedding_cls, embedding_sep, embedding_av

def mean_pooling(token_embeds, attention_mask):
    """Return the [AVG] representation of every document.

    Averages the token embeddings of each document along the token axis,
    weighting every position by its attention-mask entry so that masked
    positions do not contribute to the mean.

    Parameters
    ----------
    token_embeds : torch.Tensor
        Token embeddings, i.e. the first element of the model output
        (``model_output[0]``), e.g. of shape (n_documents, 512, 768).
    attention_mask : torch.Tensor
        ``inputs["attention_mask"]``, ``inputs`` being the output of the
        tokenizer.

    Returns
    -------
    torch.Tensor
        Masked mean over axis 1 of ``token_embeds``.
    """
    # Repeat the mask along the embedding axis so it matches token_embeds.
    mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    masked_total = (token_embeds * mask).sum(1)
    # The lower bound keeps the division defined when a row has no
    # attended tokens at all.
    attended_count = mask.sum(1).clamp(min=1e-9)
    return masked_total / attended_count


def sep_pooling(token_embeds, attention_mask):
    """Return the embedding of the last attended token of every document.

    For each row, the selected position is ``attention_mask.sum(1) - 1``.
    NOTE(review): this is the [SEP] token only if the tokenizer pads on the
    right and the sequence ends with [SEP] -- confirm against the tokenizer
    configuration.

    Parameters
    ----------
    token_embeds : torch.Tensor
        Token embeddings, i.e. the first element of the model output
        (``model_output[0]``), e.g. of shape (n_documents, 512, 768).
    attention_mask : torch.Tensor
        ``inputs["attention_mask"]``, ``inputs`` being the output of the
        tokenizer. Integer, boolean and float masks are accepted.

    Returns
    -------
    torch.Tensor
        One embedding per document, taken at the last attended position.

    Notes
    -----
    A row whose mask is entirely zero yields index ``-1`` and therefore
    selects the last position of that row.
    """
    device = token_embeds.device
    # Cast to long so that a float mask still produces valid indices, and
    # keep both index tensors on the same device as the embeddings.
    last_ix = (attention_mask.sum(1).long() - 1).to(device)
    row_ix = torch.arange(attention_mask.size(0), device=device)
    return token_embeds[row_ix, last_ix, :]


@torch.no_grad()
def generate_embeddings_batches(abstracts, tokenizer, model, device):
    """Generate embeddings for a batch of abstracts using a BERT-based model.

    The whole batch is tokenized at once (padded to the longest sequence and
    truncated to 512 tokens), passed through the model in evaluation mode,
    and three representations are extracted from the last hidden state.

    Parameters
    ----------
    abstracts : list of str
        Abstract texts. Pass a plain Python list; pandas objects are
        reported not to work and arrays are untested.
    tokenizer : transformers.models.bert.tokenization_bert_fast.BertTokenizerFast
        Tokenizer.
    model : transformers.models.bert.modeling_bert.BertModel
        BERT-based model. Only the tokenized inputs are moved to `device`
        here; the model itself is not moved by this function.
    device : str, {"cuda", "cpu"}
        "cuda" if torch.cuda.is_available() else "cpu".

    Returns
    -------
    embedding_cls : ndarray
        [CLS] tokens of the abstracts (first position of every sequence).
    embedding_sep : ndarray
        [SEP] tokens of the abstracts (last attended position).
    embedding_av : ndarray
        Average of tokens of the abstracts.
    """
    # preprocess the input
    inputs = tokenizer(
        abstracts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    ).to(device)

    # Gradients are already disabled by the decorator; eval() additionally
    # switches the model to evaluation mode.
    model.eval()
    token_embeds = model(**inputs)[0]  # get the last hidden state
    attention_mask = inputs["attention_mask"]

    pooled_av = mean_pooling(token_embeds, attention_mask)
    pooled_sep = sep_pooling(token_embeds, attention_mask)
    pooled_cls = token_embeds[:, 0, :]

    embedding_av = pooled_av.cpu().numpy()
    embedding_sep = pooled_sep.cpu().numpy()
    embedding_cls = pooled_cls.cpu().numpy()

    return embedding_cls, embedding_sep, embedding_av
16 changes: 11 additions & 5 deletions pubmed_landscape_src/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,10 +819,15 @@ def plot_tsne_colors(tsne, colors, x_lim, y_lim, ax=None, plot_type=None, axis_o
if ax is None:
fig, ax = plt.subplots()

s_grey = 0.1
# s_grey = 0.1
# s_color = 0.5
# alpha_grey = 0.2
# alpha_color = 0.2

s_grey = 0.5
s_color = 0.5
alpha_grey = 0.2
alpha_color = 0.2
alpha_grey = 0.6
alpha_color = 0.7

if plot_type=='subplot_2':
s_grey = 0.2
Expand Down Expand Up @@ -880,7 +885,7 @@ def plot_tsne_colors(tsne, colors, x_lim, y_lim, ax=None, plot_type=None, axis_o



def plot_tsne_years(tsne, colors, x_lim, y_lim, ax=None, fontsize=7, plot_type=None, colorbar=True, colorbar_type=None, axis_on=False, rs = 42):
def plot_tsne_years(tsne, colors, x_lim, y_lim, ax=None, fontsize=7, plot_type=None, colorbar=True, colorbar_type=None, axis_on=False, rs = 42, top_year="2021"):
"""Plot t-SNE embedding with colors (by years).
Parameters
Expand Down Expand Up @@ -912,6 +917,7 @@ def plot_tsne_years(tsne, colors, x_lim, y_lim, ax=None, fontsize=7, plot_type=N

assert x_lim[0] < x_lim[1], "xlim values are in the wrong order."
assert y_lim[0] < y_lim[1], "ylim values are in the wrong order."
assert type(top_year) == str, "top_year should be a string."

assert plot_type in [None, 'subplot', 'subregion', 'test'], "Not valid `plot_type` value. Choose from [None, 'subplot', 'subregion', 'test']."
assert colorbar_type in [None, 'neuroscience'], "Not valid `colorbar_type` value. Choose from [None, 'neuroscience']."
Expand Down Expand Up @@ -963,7 +969,7 @@ def plot_tsne_years(tsne, colors, x_lim, y_lim, ax=None, fontsize=7, plot_type=N

cbar.set_alpha(1)
cbar.ax.get_yaxis().set_ticks([0,1])
cbar.ax.get_yaxis().set_ticklabels(['1970','2021'])
cbar.ax.get_yaxis().set_ticklabels(['1970', top_year])
cbar.ax.tick_params(labelsize=fontsize)


Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion scripts/00-rgm-data-download.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2705,7 +2705,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.5"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion scripts/01-rgm-data-parse.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1934,7 +1934,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.5"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion scripts/02-ls-data-obtain-BERT-embeddings.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.5"
},
"varInspector": {
"cols": {
Expand Down
2 changes: 1 addition & 1 deletion scripts/04-rgm-pipeline-TFIDF.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -690,7 +690,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.5"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion scripts/09-rgm-analysis-covid-19.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"from pubmed_landscape_src.plotting import automatic_coloring"
"from pubmed_landscape_src.plotting import improved_coloring"
]
},
{
Expand Down
Loading

0 comments on commit d55b1ee

Please sign in to comment.