add rebuttal and 2024 baseline things

berenslab · Feb 27, 2024 · d55b1ee · d55b1ee
1 parent 569b3d8
commit d55b1ee
Show file tree

Hide file tree

Showing 78 changed files with 19,201 additions and 682 deletions.
diff --git a/.gitignore b/.gitignore
@@ -27,22 +27,28 @@ share/python-wheels/
 data
 results/variables
 results/figures/bert-models
-results/figures/old
-results/figures/tsne*
-results/figures/retracted*
+results/figures/extra
+results/figures/2024_baseline
+results/figures/*.pdf
+results/figures/*.png
 scripts/*.ann
 scripts/BERT-based-embeddings/07-rgm-runtimes-experiment.ipynb
 scripts/BERT-based-embeddings/08-rgm-tokenizer-experiment.ipynb
 scripts/BERT-based-embeddings/09-rgm-sanity-check-PubMedBERT.ipynb
-scripts/BERT-based-embeddings/sBERT.py
 scripts/BERT-based-embeddings/10-rgm-ls-PubMedNCL.ipynb
 scripts/BERT-based-embeddings/12-rgm-SVD-RP-experiment.ipynb
+scripts/BERT-based-embeddings/14-rgm-TF-IDF-covid-island.ipynb
+scripts/BERT-based-embeddings/15-rgm-titles-exploration.ipynb
+scripts/BERT-based-embeddings/sBERT.py
 scripts/BERT-based-embeddings/Untitled.ipynb
 scripts/BERT-based-embeddings/20ng.ipynb
-scripts/18-rgm-figure-top10-genes.ipynb
-scripts/19-rgm-data-generate-csv.ipynb
 scripts/15-rgm-analysis-retracted-papers-LOCAL.ipynb
+scripts/18-rgm-analysis-affiliations-2021-baseline-LOCAL.ipynb
+scripts/19-rgm-data-generate-csv.ipynb
+scripts/20-rgm-figure-top10-genes.ipynb
 scripts/pmids.txt
+scripts/2024-baseline/19.2-rgm-data-generate-csv.ipynb
+scripts/2024-baseline/04-rgm-analysis-affiliations-LOCAL.ipynb
 *.egg
 MANIFEST
 

diff --git a/pubmed_landscape_src/data.py b/pubmed_landscape_src/data.py
@@ -2,6 +2,7 @@
 import xml.etree.ElementTree as et
 import os
 import torch
+import numpy as np
 
 def xml_import(xml_file):
     """Parses some elements of the metadata in PubMed XML files.
@@ -60,12 +61,16 @@ def xml_import(xml_file):
     into one (due to the assymetry of the date storage, sometimes with <Day>, <Month> and <Year>, other times 
     with <MedlineDate>). If <PubDate> contains no further childs, it will print ' '.
     
-    - First author first name: It parses the <ForeName> of the first <Author> listed in <Authorlist>. Note that sometimes the metadata is not perfect inside the tag there is the complete name, including surnames. If there is no tag <ForeName>, 'no tag' will be appended. Note for the future: Maybe this is misses some names directly listed in the tag <Author>, maybe an approach similar to what I do for abstracts would be better, where everything under the <Author> tag is parsed. In that case we would also have surnames, but that can be cleaned after.
+    - First author first name: It parses the <ForeName> of the first <Author> listed in <AuthorList>. Note that the metadata is sometimes imperfect: the tag may contain the complete name, including surnames. If there is no <ForeName> tag, an empty string will be appended. Note for the future: this may miss some names listed directly in the <Author> tag; an approach similar to the one used for abstracts, where everything under the <Author> tag is parsed, might seem better, since surnames could be cleaned afterwards. Update: not really, because the affiliation is also stored under <Author>. It is fine the way it is.
     
     - Last authors first name: It parses the <ForeName> of the last <Author> listed in <Authorlist>. Note that sometimes the metadata is not perfect inside the tag there is the complete name, including surnames. If there is no tag <ForeName>, 'no tag' will be appended. Note for the future: maybe same problem as in first authors.
     
     - ISSN (stored in <ISSN>): If there is no tag <ISSN>, it will add 'no tag'. If <ISSN> contains no text, it will add '' (empty string). 
 
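+    - Affiliation first author: It parses the <Affiliation> inside the first <AffiliationInfo> of the first <Author> listed in <AuthorList>. If there is no <AuthorList>, <AffiliationInfo> or <Affiliation> tag, or if <Affiliation> contains no text, it will add '' (empty string).
+
+    - Affiliation last author: It parses the <Affiliation> inside the first <AffiliationInfo> of the last <Author> found under <AuthorList>. If there is no <AuthorList>, <AffiliationInfo> or <Affiliation> tag, or if <Affiliation> contains no text, it will add '' (empty string).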
+
     """
 
 
@@ -217,7 +222,7 @@ def xml_import(xml_file):
                             tag=author.find('ForeName')
 
                             if tag is None:
-                                ros.append(['no tag'])
+                                ros.append("")
                             else:
                                 res=[]
                                 if not tag.text:
@@ -245,10 +250,7 @@ def xml_import(xml_file):
                             for elem in author.iter('Author'):
                                 tag=elem.find('ForeName')
                                 if tag is None:
-                                    res.append(['no tag'])
-                                    #for the next time, this is bad because then the element is a list and not a string
-                                    #it should be:
-                                    # res.append('no tag')  
+                                    res.append("") 
                                 else:
                                     if not tag.text:
                                         res.append('')
@@ -280,6 +282,66 @@ def xml_import(xml_file):
     ros=[' '.join(ele) for ele in ros]
     dicc['ISSN']=ros
 
+
+    # Affiliation of the first author 
+    ros = []
+    for child1 in xroot:
+        for child2 in child1:
+            for child3 in child2:
+                for child4 in child3.iter("Article"):
+                    authorlist = child4.find("AuthorList")
+                    if authorlist is None:
+                        ros.append("")
+                    else:
+                        for elem in child4.iter("AuthorList"):
+                            author = elem.find("Author")
+                            affil = author.find("AffiliationInfo")
+                            if affil is None:
+                                ros.append("")  
+                            else:
+                                tag = affil.find("Affiliation")
+                                if tag is None:
+                                    ros.append("")  
+                                else:
+                                    res = []
+                                    if not tag.text:
+                                        res.append("")
+                                    else:
+                                        res.append(tag.text)
+                                    ros.append(res)
+
+    ros = [" ".join(ele) for ele in ros]
+    dicc['AffiliationFirstAuthor']=ros
+
+
+    # Affiliation of the last author
+    ros = []
+    for child1 in xroot:
+        for child2 in child1:
+            for child3 in child2:
+                for child4 in child3.iter("Article"):
+                    authorlist = child4.find("AuthorList")
+                    if authorlist is None:
+                        ros.append("")
+                    else:
+                        for author in child4.iter("AuthorList"):
+                            res = []
+                            for elem in author.iter("Author"):
+                                affil = elem.find("AffiliationInfo")
+                                if affil is None:
+                                    res.append("")
+                                else:
+                                    tag = affil.find("Affiliation")
+                                    if tag is None:
+                                        res.append("")
+                                    else:
+                                        if not tag.text:
+                                            res.append("")
+                                        else:
+                                            res.append(tag.text)
+                            ros.append(res[-1])
+
+    dicc['AffiliationLastAuthor']=ros
 
     out=pd.DataFrame.from_dict(dicc)
     return out, dicc
@@ -313,6 +375,7 @@ def import_all_files(path, order_files=False):
 
     name_files_array=np.array(name_files)
     name_xml_files=name_files_array[len_filenames==17]
+    name_xml_files.sort()
 
     # import
     frame_all_df=[]
@@ -373,4 +436,85 @@ def generate_embeddings(abstracts, tokenizer, model, device):
     embedding_cls = outputs[:, 0, :].numpy()
 
 
-    return embedding_cls, embedding_sep, embedding_av 
+    return embedding_cls, embedding_sep, embedding_av 
+
+def mean_pooling(token_embeds, attention_mask):
+    """Return the attention-masked mean of the token embeddings ([AVG]).
+
+    Each token embedding is multiplied by its attention-mask value, the
+    result is summed over dimension 1 and divided by the sum of the mask
+    over the same dimension, so positions with mask 0 do not contribute.
+
+    Parameters
+    ----------
+    token_embeds : torch.Tensor
+        Token embeddings, i.e. the first element of the model output
+        (model_output[0]). Presumably of shape
+        (n_documents, n_tokens, hidden_size), e.g. (n_documents, 512, 768)
+        -- TODO confirm against the caller.
+    attention_mask : torch.Tensor
+        inputs["attention_mask"], inputs being the output of the tokenizer.
+        It is unsqueezed on the last axis and expanded to the size of
+        `token_embeds`.
+
+    Returns
+    -------
+    torch.Tensor
+        `token_embeds` averaged over dimension 1, using the mask as weights.
+    """
+    # Broadcast the mask over the embedding dimension and cast it to float.
+    input_mask_expanded = (
+        attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
+    )
+    sum_embeddings = torch.sum(token_embeds * input_mask_expanded, 1)
+    # Clamp the denominator so an all-zero mask row does not divide by zero.
+    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    return sum_embeddings / sum_mask
+
+
+def sep_pooling(token_embeds, attention_mask):
+    """Return the embedding at the last attended position of each row ([SEP]).
+
+    For every row the position is computed as (sum of the attention mask) - 1.
+    NOTE(review): this is the [SEP] token only if the mask is a contiguous
+    run of ones starting at position 0 (right padding) and the tokenizer
+    ends each sequence with [SEP] -- confirm for the tokenizer in use.
+
+    Parameters
+    ----------
+    token_embeds : torch.Tensor
+        Token embeddings, i.e. the first element of the model output
+        (model_output[0]). Presumably of shape
+        (n_documents, n_tokens, hidden_size), e.g. (n_documents, 512, 768)
+        -- TODO confirm against the caller.
+    attention_mask : torch.Tensor
+        inputs["attention_mask"], inputs being the output of the tokenizer.
+
+    Returns
+    -------
+    torch.Tensor
+        One embedding vector per row of `token_embeds`.
+    """
+    # Index of the last position with mask 1 (see the note in the docstring).
+    ix = attention_mask.sum(1) - 1
+    # Row indices 0..n_rows-1, paired element-wise with `ix` below.
+    # NOTE(review): created on the default device, while `ix` lives on the
+    # device of `attention_mask` -- confirm mixed-device indexing is intended.
+    ix0 = torch.arange(attention_mask.size(0))
+    return token_embeds[ix0, ix, :]
+
+
+@torch.no_grad()
+def generate_embeddings_batches(abstracts, tokenizer, model, device):
+    """Generate [CLS], [SEP] and average embeddings with a BERT-based model.
+
+    The texts are tokenized together (padded, truncated to at most 512
+    tokens), the tokenizer output is moved to `device`, the model is run in
+    eval mode without gradients, and the three pooled representations are
+    returned as NumPy arrays on the CPU.
+
+    Parameters
+    ----------
+    abstracts : list, this has to be a list not sure if array works but pandas do not work
+        Abstract texts.
+    tokenizer : transformers.models.bert.tokenization_bert_fast.BertTokenizerFast
+        Tokenizer.
+    model : transformers.models.bert.modeling_bert.BertModel
+        BERT-based model. It is not moved to `device` here; presumably the
+        caller has already done so -- TODO confirm.
+    device : str, {"cuda", "cpu"}
+        "cuda" if torch.cuda.is_available() else "cpu".
+
+    Returns
+    -------
+    embedding_cls : ndarray
+        [CLS] tokens of the abstracts.
+    embedding_sep : ndarray
+        [SEP] tokens of the abstracts.
+    embedding_av : ndarray
+        Average of tokens of the abstracts.
+    """
+    # preprocess the input
+    inputs = tokenizer(
+        abstracts,
+        padding=True,
+        truncation=True,
+        return_tensors="pt",
+        max_length=512,
+    ).to(device)
+
+    # NOTE(review): this context manager is redundant with the
+    # @torch.no_grad() decorator on the function.
+    with torch.no_grad():
+        # Switches the model to eval mode on every call.
+        model.eval()
+        out = model(**inputs)
+        token_embeds = out[0]  # get the last hidden state
+        av = mean_pooling(token_embeds, inputs["attention_mask"])
+        sep = sep_pooling(token_embeds, inputs["attention_mask"])
+        # The embedding at position 0 of every sequence is used as [CLS].
+        cls = token_embeds[:, 0, :]
+        # Move the results to the CPU before converting to NumPy.
+        embedding_av = av.detach().cpu().numpy()
+        embedding_sep = sep.detach().cpu().numpy()
+        embedding_cls = cls.detach().cpu().numpy()
+
+    return embedding_cls, embedding_sep, embedding_av
diff --git a/pubmed_landscape_src/plotting.py b/pubmed_landscape_src/plotting.py
@@ -819,10 +819,15 @@ def plot_tsne_colors(tsne, colors, x_lim, y_lim, ax=None, plot_type=None, axis_o
     if ax is None:
         fig, ax = plt.subplots()
 
-    s_grey = 0.1
+    # s_grey = 0.1
+    # s_color = 0.5
+    # alpha_grey = 0.2
+    # alpha_color = 0.2
+
+    s_grey = 0.5
     s_color = 0.5
-    alpha_grey = 0.2
-    alpha_color = 0.2
+    alpha_grey = 0.6
+    alpha_color = 0.7
 
     if plot_type=='subplot_2':
         s_grey = 0.2
@@ -880,7 +885,7 @@ def plot_tsne_colors(tsne, colors, x_lim, y_lim, ax=None, plot_type=None, axis_o
 
 
 
-def plot_tsne_years(tsne, colors, x_lim, y_lim, ax=None, fontsize=7, plot_type=None, colorbar=True, colorbar_type=None, axis_on=False, rs = 42):
+def plot_tsne_years(tsne, colors, x_lim, y_lim, ax=None, fontsize=7, plot_type=None, colorbar=True, colorbar_type=None, axis_on=False, rs = 42, top_year="2021"):
     """Plot t-SNE embedding with colors (by years).
     
     Parameters
@@ -912,6 +917,7 @@ def plot_tsne_years(tsne, colors, x_lim, y_lim, ax=None, fontsize=7, plot_type=N
 
     assert x_lim[0] < x_lim[1], "xlim values are in the wrong order."
     assert y_lim[0] < y_lim[1], "ylim values are in the wrong order."
+    assert type(top_year) == str, "top_year should be a string."
 
     assert plot_type in [None, 'subplot', 'subregion', 'test'], "Not valid `plot_type` value. Choose from [None, 'subplot', 'subregion', 'test']."
     assert colorbar_type in [None, 'neuroscience'], "Not valid `colorbar_type` value. Choose from [None, 'neuroscience']."
@@ -963,7 +969,7 @@ def plot_tsne_years(tsne, colors, x_lim, y_lim, ax=None, fontsize=7, plot_type=N
 
         cbar.set_alpha(1)
         cbar.ax.get_yaxis().set_ticks([0,1])
-        cbar.ax.get_yaxis().set_ticklabels(['1970','2021'])
+        cbar.ax.get_yaxis().set_ticklabels(['1970', top_year])
         cbar.ax.tick_params(labelsize=fontsize)
 
 

diff --git a/results/figures/fig_1_general_embedding.pdf → ...figures/final/fig_1_general_embedding.pdf b/results/figures/fig_1_general_embedding.pdf → ...figures/final/fig_1_general_embedding.pdf
diff --git a/results/figures/fig_1_general_embedding.png → ...figures/final/fig_1_general_embedding.png b/results/figures/fig_1_general_embedding.png → ...figures/final/fig_1_general_embedding.png
diff --git a/results/figures/fig_2_covid.pdf → results/figures/final/fig_2_covid.pdf b/results/figures/fig_2_covid.pdf → results/figures/final/fig_2_covid.pdf
diff --git a/results/figures/fig_2_covid.png → results/figures/final/fig_2_covid.png b/results/figures/fig_2_covid.png → results/figures/final/fig_2_covid.png
diff --git a/results/figures/fig_3_nsc.pdf → results/figures/final/fig_3_nsc.pdf b/results/figures/fig_3_nsc.pdf → results/figures/final/fig_3_nsc.pdf
diff --git a/results/figures/fig_3_nsc.png → results/figures/final/fig_3_nsc.png b/results/figures/fig_3_nsc.png → results/figures/final/fig_3_nsc.png
diff --git a/results/figures/fig_4_high_res.pdf → results/figures/final/fig_4_high_res.pdf b/results/figures/fig_4_high_res.pdf → results/figures/final/fig_4_high_res.pdf
diff --git a/results/figures/fig_4_high_res.png → results/figures/final/fig_4_high_res.png b/results/figures/fig_4_high_res.png → results/figures/final/fig_4_high_res.png
diff --git a/results/figures/fig_4_ml.pdf → results/figures/final/fig_4_ml.pdf b/results/figures/fig_4_ml.pdf → results/figures/final/fig_4_ml.pdf
diff --git a/results/figures/fig_4_ml.png → results/figures/final/fig_4_ml.png b/results/figures/fig_4_ml.png → results/figures/final/fig_4_ml.png
diff --git a/results/figures/fig_5_genders.pdf → results/figures/final/fig_5_genders.pdf b/results/figures/fig_5_genders.pdf → results/figures/final/fig_5_genders.pdf
diff --git a/results/figures/fig_5_genders.png → results/figures/final/fig_5_genders.png b/results/figures/fig_5_genders.png → results/figures/final/fig_5_genders.png
diff --git a/results/figures/fig_6_retracted_papers.pdf → .../figures/final/fig_6_retracted_papers.pdf b/results/figures/fig_6_retracted_papers.pdf → .../figures/final/fig_6_retracted_papers.pdf
diff --git a/results/figures/fig_6_retracted_papers.png → .../figures/final/fig_6_retracted_papers.png b/results/figures/fig_6_retracted_papers.png → .../figures/final/fig_6_retracted_papers.png
diff --git a/results/figures/fig_S10_covid_ablation.pdf → .../figures/final/fig_S10_covid_ablation.pdf b/results/figures/fig_S10_covid_ablation.pdf → .../figures/final/fig_S10_covid_ablation.pdf
diff --git a/results/figures/fig_S10_covid_ablation.png → .../figures/final/fig_S10_covid_ablation.png b/results/figures/fig_S10_covid_ablation.png → .../figures/final/fig_S10_covid_ablation.png
diff --git a/results/figures/fig_S1_summary_data.pdf → ...lts/figures/final/fig_S1_summary_data.pdf b/results/figures/fig_S1_summary_data.pdf → ...lts/figures/final/fig_S1_summary_data.pdf
diff --git a/results/figures/fig_S1_summary_data.png → ...lts/figures/final/fig_S1_summary_data.png b/results/figures/fig_S1_summary_data.png → ...lts/figures/final/fig_S1_summary_data.png
diff --git a/...figures/fig_S2_interesting_embeddings.pdf → ...s/final/fig_S2_interesting_embeddings.pdf b/...figures/fig_S2_interesting_embeddings.pdf → ...s/final/fig_S2_interesting_embeddings.pdf
diff --git a/...figures/fig_S2_interesting_embeddings.png → ...s/final/fig_S2_interesting_embeddings.png b/...figures/fig_S2_interesting_embeddings.png → ...s/final/fig_S2_interesting_embeddings.png
diff --git a/...ts/figures/fig_S3_bert_colored_by_p_n.pdf → ...ures/final/fig_S3_bert_colored_by_p_n.pdf b/...ts/figures/fig_S3_bert_colored_by_p_n.pdf → ...ures/final/fig_S3_bert_colored_by_p_n.pdf
diff --git a/...ts/figures/fig_S3_bert_colored_by_p_n.png → ...ures/final/fig_S3_bert_colored_by_p_n.png b/...ts/figures/fig_S3_bert_colored_by_p_n.png → ...ures/final/fig_S3_bert_colored_by_p_n.png
diff --git a/...igures/fig_S4_general_embedding_tfidf.pdf → .../final/fig_S4_general_embedding_tfidf.pdf b/...igures/fig_S4_general_embedding_tfidf.pdf → .../final/fig_S4_general_embedding_tfidf.pdf
diff --git a/...igures/fig_S4_general_embedding_tfidf.png → .../final/fig_S4_general_embedding_tfidf.png b/...igures/fig_S4_general_embedding_tfidf.png → .../final/fig_S4_general_embedding_tfidf.png
diff --git a/results/figures/fig_S5_embeddings_grey.pdf → .../figures/final/fig_S5_embeddings_grey.pdf b/results/figures/fig_S5_embeddings_grey.pdf → .../figures/final/fig_S5_embeddings_grey.pdf
diff --git a/results/figures/fig_S5_embeddings_grey.png → .../figures/final/fig_S5_embeddings_grey.png b/results/figures/fig_S5_embeddings_grey.png → .../figures/final/fig_S5_embeddings_grey.png
diff --git a/...ures/fig_S6_embeddings_and_subregions.pdf → ...inal/fig_S6_embeddings_and_subregions.pdf b/...ures/fig_S6_embeddings_and_subregions.pdf → ...inal/fig_S6_embeddings_and_subregions.pdf
diff --git a/...ures/fig_S6_embeddings_and_subregions.png → ...inal/fig_S6_embeddings_and_subregions.png b/...ures/fig_S6_embeddings_and_subregions.png → ...inal/fig_S6_embeddings_and_subregions.png
diff --git a/...figures/fig_S7_bert_colored_by_length.pdf → ...s/final/fig_S7_bert_colored_by_length.pdf b/...figures/fig_S7_bert_colored_by_length.pdf → ...s/final/fig_S7_bert_colored_by_length.pdf
diff --git a/...figures/fig_S7_bert_colored_by_length.png → ...s/final/fig_S7_bert_colored_by_length.png b/...figures/fig_S7_bert_colored_by_length.png → ...s/final/fig_S7_bert_colored_by_length.png
diff --git a/...ts/figures/fig_S8_tsne_BERT_models_1M.pdf → ...ures/final/fig_S8_tsne_BERT_models_1M.pdf b/...ts/figures/fig_S8_tsne_BERT_models_1M.pdf → ...ures/final/fig_S8_tsne_BERT_models_1M.pdf
diff --git a/...ts/figures/fig_S8_tsne_BERT_models_1M.png → ...ures/final/fig_S8_tsne_BERT_models_1M.png b/...ts/figures/fig_S8_tsne_BERT_models_1M.png → ...ures/final/fig_S8_tsne_BERT_models_1M.png
diff --git a/results/figures/fig_S9_tsne_vs_umap_1M.pdf → .../figures/final/fig_S9_tsne_vs_umap_1M.pdf b/results/figures/fig_S9_tsne_vs_umap_1M.pdf → .../figures/final/fig_S9_tsne_vs_umap_1M.pdf
diff --git a/results/figures/fig_S9_tsne_vs_umap_1M.png → .../figures/final/fig_S9_tsne_vs_umap_1M.png b/results/figures/fig_S9_tsne_vs_umap_1M.png → .../figures/final/fig_S9_tsne_vs_umap_1M.png
diff --git a/results/figures/final/fig_tsne_and_number_of_papers_by_country_v5.pdf b/results/figures/final/fig_tsne_and_number_of_papers_by_country_v5.pdf
diff --git a/results/figures/final/fig_tsne_and_number_of_papers_by_country_v5.png b/results/figures/final/fig_tsne_and_number_of_papers_by_country_v5.png
diff --git a/results/figures/final/fig_tsne_and_retracted_papers_by_country_v4.pdf b/results/figures/final/fig_tsne_and_retracted_papers_by_country_v4.pdf
diff --git a/results/figures/final/fig_tsne_and_retracted_papers_by_country_v4.png b/results/figures/final/fig_tsne_and_retracted_papers_by_country_v4.png
diff --git a/results/figures/final/fig_updated_general_embedding_v5.pdf b/results/figures/final/fig_updated_general_embedding_v5.pdf
diff --git a/results/figures/final/fig_updated_general_embedding_v5.png b/results/figures/final/fig_updated_general_embedding_v5.png
diff --git a/scripts/00-rgm-data-download.ipynb b/scripts/00-rgm-data-download.ipynb
@@ -2705,7 +2705,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,

diff --git a/scripts/01-rgm-data-parse.ipynb b/scripts/01-rgm-data-parse.ipynb
@@ -1934,7 +1934,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,

diff --git a/scripts/02-ls-data-obtain-BERT-embeddings.ipynb b/scripts/02-ls-data-obtain-BERT-embeddings.ipynb
@@ -696,7 +696,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.11.5"
   },
   "varInspector": {
    "cols": {

diff --git a/scripts/04-rgm-pipeline-TFIDF.ipynb b/scripts/04-rgm-pipeline-TFIDF.ipynb
@@ -690,7 +690,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,

diff --git a/scripts/09-rgm-analysis-covid-19.ipynb b/scripts/09-rgm-analysis-covid-19.ipynb
@@ -81,7 +81,7 @@
     "%load_ext autoreload\n",
     "%autoreload 2\n",
     "\n",
-    "from pubmed_landscape_src.plotting import automatic_coloring"
+    "from pubmed_landscape_src.plotting import improved_coloring"
    ]
   },
   {