update data tab (Crossref Snapshot 08/2024)

subugoe · Sep 11, 2024 · a959a9e · a959a9e
1 parent 34355f5
commit a959a9e
Show file tree

Hide file tree

Showing 7 changed files with 144 additions and 144 deletions.
diff --git a/data.Rmd b/data.Rmd
@@ -25,7 +25,7 @@ Anyone can view and query our publicly available [Open Scholarly Data  warehouse
 
 | Snapshot        | File            | Table               | Schema               | Procedure | Last Changed | Coverage  | Number of rows |
 |-----------------|-----------------|---------------------|----------------------|-----------|--------------|-----------|--------------------|
-|  2024/07        | all.json.tar.gz | [cr_instant.snapshot](https://console.cloud.google.com/bigquery?ws=!1m4!1m3!3m2!1ssubugoe-collaborative!2scr_instant) | schema_crossref.json | [Repo](https://github.com/naustica/crossref_bq) |  13.08.2024  | 2013-2024 | 49.288.254 |
+|  2024/08        | all.json.tar.gz | [cr_instant.snapshot](https://console.cloud.google.com/bigquery?ws=!1m4!1m3!3m2!1ssubugoe-collaborative!2scr_instant) | schema_crossref.json | [Repo](https://github.com/naustica/crossref_bq) |  11.09.2024  | 2013-2024 | 49.874.183 |
 
 :::
 

diff --git a/docs/data.html b/docs/data.html
@@ -2224,14 +2224,14 @@ <h3 id="current-snapshot-cr_instant">Current Snapshot (cr_instant)</h3>
 </thead>
 <tbody>
 <tr class="odd">
-<td>2024/07</td>
+<td>2024/08</td>
 <td>all.json.tar.gz</td>
 <td><a href="https://console.cloud.google.com/bigquery?ws=!1m4!1m3!3m2!1ssubugoe-collaborative!2scr_instant">cr_instant.snapshot</a></td>
 <td>schema_crossref.json</td>
 <td><a href="https://github.com/naustica/crossref_bq">Repo</a></td>
-<td>13.08.2024</td>
+<td>11.09.2024</td>
 <td>2013-2024</td>
-<td>49.288.254</td>
+<td>49.874.183</td>
 </tr>
 </tbody>
 </table>

diff --git a/docs/index.html b/docs/index.html
@@ -2174,145 +2174,145 @@ <h3>${suggestion.title}</h3>
 <!--radix_placeholder_site_before_body-->
 <!--/radix_placeholder_site_before_body-->
 <!--radix_placeholder_article_listing-->
-
-
-<script type="application/javascript">
-
-function init_posts_list() {
-
-  function load_image(img) {
-    var src = $(img).attr('data-src');
-    if (src) {
-      $(img).attr('src', src);
-      $(img).on("load", function() {
-        img.removeAttribute('data-src');
-      });
-    }
-  }
-
-  function set_posts_visible(posts, visible) {
-    if (visible) {
-
-      // show bottom border by default
-      $(posts).removeClass('post-preview-last');
-
-      // apply limits if need be
-      var max_posts = 25;
-      var apply_limits = $('.posts-container').hasClass('posts-apply-limit');
-      if (apply_limits && posts.length > max_posts) {
-        posts = $(posts).slice(0, max_posts);
-      } else {
-        $('.posts-more a').addClass('hidden');
-      }
-
-      // apply last style
-      $(posts.slice(-1)[0]).addClass('post-preview-last');
-
-      $(posts).removeClass('hidden');
-      $(posts).find('img[data-src]').each(function(i, img) {
-        load_image(img);
-      });
-    } else {
-      $(posts).addClass('hidden');
-    }
-  }
-
-  function apply_hash_filter() {
-
-    // clear active state
-    $('.categories .active').removeClass('active');
-
-    // mark all posts invisible to start
-    set_posts_visible($('.posts-list').children('a'), false);
-
-    // if we have a hash filter
-    if (window.location.hash && window.location.hash.startsWith("#category:")) {
-
-      // mark posts that match the category visible
-      var page_category = window.location.hash.replace(/^#category:/, "");
-      page_category = decodeURIComponent(page_category)
-      var posts = $('.post-metadata').map(function(idx, script) {
-        var metadata = $.parseJSON($(script).html());
-        var post = null;
-        $.each(metadata.categories, function(idx, category) {
-          category = category.replace(/ /g,"_");
-          if ((page_category || '').toLowerCase() === "articles" || category === page_category) {
-            post = $(script).parent().get();
-            return false;
-          }
-        });
-        return post;
-      });
-      set_posts_visible(posts, true);
-
-      // mark the hash active
-      $('.categories li>a[href="' + decodeURIComponent(window.location.hash) + '"]').addClass('active');
-
-      // update the list_caption
-      var list_caption = $('.posts-list-caption');
-      var caption = (page_category || '').toLowerCase() === "articles"
-        ? list_caption.attr('data-caption')
-        : ('Category: ' + page_category.replace(/_/g," "));
-      list_caption.text(caption);
-
-    } else {
-
-      // no hash filter, make all posts visible (subject to max display)
-      set_posts_visible($('.posts-list').children(), true);
-
-      // reset list caption
-      var list_caption = $('.posts-list-caption');
-      list_caption.text(list_caption.attr('data-caption'));
-
-
-    }
-  }
-
-  // more articles
-  function apply_post_limits(apply) {
-    if (apply) {
-      $('.posts-container').addClass('posts-apply-limit');
-      $('.posts-more a').removeClass('hidden');
-    } else {
-      $('.posts-container').removeClass('posts-apply-limit');
-      $('.posts-more a').addClass('hidden');
-    }
-  }
-
-  // click handling for tags
-  $('.dt-tag').click(function(ev) {
-    window.location.hash = '#category:' + $(this).text().replace(/ /g, "_");
-    return false;
-  })
-
-  // hash filter handling
-  apply_hash_filter();
-  $(window).on('hashchange',function() {
-    apply_post_limits(true);
-    apply_hash_filter();
-  });
-
-  // more articles link
-  $('.posts-more a').click(function(e) {
-    e.preventDefault();
-    apply_post_limits(false);
-    apply_hash_filter();
-    return false;
-  });
-
-}
-
-</script>
-
-
+
+
+<script type="application/javascript">
+
+function init_posts_list() {
+
+  function load_image(img) {
+    var src = $(img).attr('data-src');
+    if (src) {
+      $(img).attr('src', src);
+      $(img).on("load", function() {
+        img.removeAttribute('data-src');
+      });
+    }
+  }
+
+  function set_posts_visible(posts, visible) {
+    if (visible) {
+
+      // show bottom border by default
+      $(posts).removeClass('post-preview-last');
+
+      // apply limits if need be
+      var max_posts = 25;
+      var apply_limits = $('.posts-container').hasClass('posts-apply-limit');
+      if (apply_limits && posts.length > max_posts) {
+        posts = $(posts).slice(0, max_posts);
+      } else {
+        $('.posts-more a').addClass('hidden');
+      }
+
+      // apply last style
+      $(posts.slice(-1)[0]).addClass('post-preview-last');
+
+      $(posts).removeClass('hidden');
+      $(posts).find('img[data-src]').each(function(i, img) {
+        load_image(img);
+      });
+    } else {
+      $(posts).addClass('hidden');
+    }
+  }
+
+  function apply_hash_filter() {
+
+    // clear active state
+    $('.categories .active').removeClass('active');
+
+    // mark all posts invisible to start
+    set_posts_visible($('.posts-list').children('a'), false);
+
+    // if we have a hash filter
+    if (window.location.hash && window.location.hash.startsWith("#category:")) {
+
+      // mark posts that match the category visible
+      var page_category = window.location.hash.replace(/^#category:/, "");
+      page_category = decodeURIComponent(page_category)
+      var posts = $('.post-metadata').map(function(idx, script) {
+        var metadata = $.parseJSON($(script).html());
+        var post = null;
+        $.each(metadata.categories, function(idx, category) {
+          category = category.replace(/ /g,"_");
+          if ((page_category || '').toLowerCase() === "articles" || category === page_category) {
+            post = $(script).parent().get();
+            return false;
+          }
+        });
+        return post;
+      });
+      set_posts_visible(posts, true);
+
+      // mark the hash active
+      $('.categories li>a[href="' + decodeURIComponent(window.location.hash) + '"]').addClass('active');
+
+      // update the list_caption
+      var list_caption = $('.posts-list-caption');
+      var caption = (page_category || '').toLowerCase() === "articles"
+        ? list_caption.attr('data-caption')
+        : ('Category: ' + page_category.replace(/_/g," "));
+      list_caption.text(caption);
+
+    } else {
+
+      // no hash filter, make all posts visible (subject to max display)
+      set_posts_visible($('.posts-list').children(), true);
+
+      // reset list caption
+      var list_caption = $('.posts-list-caption');
+      list_caption.text(list_caption.attr('data-caption'));
+
+
+    }
+  }
+
+  // more articles
+  function apply_post_limits(apply) {
+    if (apply) {
+      $('.posts-container').addClass('posts-apply-limit');
+      $('.posts-more a').removeClass('hidden');
+    } else {
+      $('.posts-container').removeClass('posts-apply-limit');
+      $('.posts-more a').addClass('hidden');
+    }
+  }
+
+  // click handling for tags
+  $('.dt-tag').click(function(ev) {
+    window.location.hash = '#category:' + $(this).text().replace(/ /g, "_");
+    return false;
+  })
+
+  // hash filter handling
+  apply_hash_filter();
+  $(window).on('hashchange',function() {
+    apply_post_limits(true);
+    apply_hash_filter();
+  });
+
+  // more articles link
+  $('.posts-more a').click(function(e) {
+    e.preventDefault();
+    apply_post_limits(false);
+    apply_hash_filter();
+    return false;
+  });
+
+}
+
+</script>
+
+
 
 <div class="posts-container posts-apply-limit l-page">
 <div class="posts-list">
 <h1 class="posts-list-caption" data-caption="Blog | Scholarly Communication Analytics with R">Blog | Scholarly Communication Analytics with R</h1>
 <a href="posts/openalex_document_types/" class="post-preview">
 <script class="post-metadata" type="text/json">{"categories":[]}</script>
 <div class="metadata">
-<div class="publishedDate">Sept. 3, 2024</div>
+<div class="publishedDate">Sept. 4, 2024</div>
 <div class="dt-authors">
 <div class="dt-author">Nick Haupka</div>
 <div class="dt-author">Sophia Dörner</div>

diff --git a/docs/index.xml b/docs/index.xml
@@ -11,7 +11,7 @@ to publish case-studies rapidely showing how to support data-driven workflows an
 decision-making around scholarly communication in libraries using R.
 </description>
     <generator>Distill</generator>
-    <lastBuildDate>Tue, 03 Sep 2024 00:00:00 +0000</lastBuildDate>
+    <lastBuildDate>Wed, 04 Sep 2024 00:00:00 +0000</lastBuildDate>
     <item>
       <title>Recent Changes in Document type classification in OpenAlex compared to Web of Science and Scopus</title>
       <dc:creator>Nick Haupka</dc:creator>
@@ -20,7 +20,7 @@ decision-making around scholarly communication in libraries using R.
       <link>https://subugoe.github.io/scholcomm_analytics/posts/openalex_document_types</link>
       <description>In June 2024, we published a preprint on the classification of document types in OpenAlex and compared it with the scholarly databases Web of Science, Scopus, PubMed and Semantic Scholar. In this follow-up study, we want to investigate further developments in OpenAlex and compare the results with the proprietary databases Scopus and Web of Science.</description>
       <guid>https://subugoe.github.io/scholcomm_analytics/posts/openalex_document_types</guid>
-      <pubDate>Tue, 03 Sep 2024 00:00:00 +0000</pubDate>
+      <pubDate>Wed, 04 Sep 2024 00:00:00 +0000</pubDate>
       <media:content url="https://subugoe.github.io/scholcomm_analytics/posts/openalex_document_types/distill-preview.png" medium="image" type="image/png" width="1416" height="1250"/>
     </item>
     <item>

diff --git a/docs/posts/posts.json b/docs/posts/posts.json
@@ -22,7 +22,7 @@
     "contents": "\n\n\n\n\n\n\n\n\n\nIn June 2024, we submitted an analysis of publication and document types in OpenAlex in comparison with the proprietary databases Web of Science and Scopus and the open data sources Semantic Scholar and PubMed (Haupka et al. 2024).\nWe found substantial differences between these databases: While Web of Science and Scopus provided a comprehensive set of document types to describe works published in journals, OpenAlex supported only a comparably limited number of types.\nNotably, OpenAlex lacked a distinction between research articles and reviews, which can be crucial when calculating citation indicators.\nIn line with related studies (Alperin et al. 2024), we also observed discrepancies in the number of publications when restricting to certain document types.\nMeanwhile, in late May and late July 2024, OpenAlex introduced extended approaches to obtain publication and document types.\nAmong the four new categories were preprints and reviews. Using PubMed, OpenAlex identified approximately 4 million journal articles as editorials, erratum, letters, preprints, reviews, or retractions.\nOf course, we wanted to know how these improvements affect our findings.\nWe therefore re-applied our approach to the recent changes.\nUsing works published in journals between 2012 and 2022, we demonstrate that OpenAlex’s recent changes provide a more nuanced set of document types to refine scholarly works.\nHowever, the comparison with Web of Science and Scopus reveals that there remain considerable differences.\nData and Methods\nFollowing our preprint, we performed a pairwise comparison of journal publications indexed in OpenAlex with the Web of Science and Scopus published 2012 to 2022.\nTo investigate changes made in OpenAlex, we furthermore compared data from the OpenAlex July 2024 and August 2023 snapshots.\nScopus and Web of Science data were retrieved from the German Competence Network of Bibliometrics, using the April 2024 snapshots.\nWeb of Science data retrieval comprised the Core Collection.\nWe matched items between the databases by DOI after normalisation to lowercase.\nOverall, the intersection of OpenAlex and Scopus covered 24,704,172 and the intersection of OpenAlex and Web of Science covered 21,775,771 records.\nThen, we categorised works based on their document type information into two categories: research discourse and editorial discourse.\nThe research discourse category now also includes publications of type “preprint”, which was added to OpenAlex in May 2024.\nThe mapping tables used for reclassifying the document types can be found in the appendix of Haupka et al. (2024).\nFindings\nFigure 1 illustrates OpenAlex document type changes in comparison with Scopus.\nBefore the introduction of the more nuanced set of document types, OpenAlex tagged\n24,559,634 items (99.42%) as articles, which reduced to 22,132,347 (89.59%).\nScopus tagged 20,777,473 items (84.11%) as article.\nOpenAlex assigned the type review to 1,511,172 items (6.12%), whereas Scopus to 1,776,555 items (7.19%).\n\n\n\n\nFigure 1: Comparison of OpenAlex and Scopus for publication years 2012-2022\n\n\n\nFigure 2 illustrates the same for the comparison of OpenAlex with Web of Science.\nHere, OpenAlex tagged 21,673,833 items (99.53%) as articles before the introduction of the more nuanced set of document types and 19,500,710 (89.55%) after.\nIn Web of Science 17,266,997 items (79.29%) were tagged as articles.\nThe document type review is assigned to 1,362,290 items (6.26%) by OpenAlex, whereas Web of Science tagged 1,242,472 items (5.71%) as such.\n\n\n\n\nFigure 2: Comparison of OpenAlex and Web of Science for publication years 2012-2022\n\n\n\nOverall, Figures 1 and 2 demonstrate that even after the introduction of a more nuanced set of document types, OpenAlex still tags a higher proportion of items as articles than the commercial data sources.\nThe difference between the proportions of items tagged as articles is, however, slightly more pronounced in the comparison of OpenAlex with Web of Science.\nScopus tags a higher proportion of items as reviews and both Scopus and Web of Science still tag more items as editorial content than OpenAlex.\nIn sum, 340,998 (Scopus) and 656,366 (Web of Science) items are tagged as editorial/editorial material or letters in Scopus and Web of Science, respectively, while tagged as articles in OpenAlex.\nWhen grouping the document types into the two categories research discourse and editorial discourse, we found that even after the introduction of a more nuanced set of document types in OpenAlex, the proportion of items labelled as editorial discourse is still about 3% lower compared to Scopus and Web of Science, as shown in the tables below.\n\n\n\n\n\n\n\n\n\n\n\n\nDiscussion and Outlook\nOur updated analysis demonstrated a noticable improvement of the classification of document types in OpenAlex when comparing it to Scopus and Web of Science.\nCompared to data from 2023, the discrepancy in the classification of items has decreased slightly.\nThis indicates a convergence of the classification system in OpenAlex towards those from proprietary databases, with an enhanced coverage of reviews and editorial materials.\nIn addition, the rule-based string matching for recognising paratexts introduced and revised by OpenAlex resulted in more texts being categorised as editorial material than before.\nHowever, the results also show that the curation of document types has not yet been finalised.\nConclusively, we would like to point out that there is no correct classification system per se.\nRather different classification systems applied by the database operators can bring advantages and disadvantages.\nIn Semantic Scholar and PubMed, for example, publications are labelled as clinical studies and case reports, which in Scopus, Web of Science and OpenAlex are predominantly assigned to the document type article.\nA differentiation of these publications has the potential to increase the quality of bibliometric surveys in the analysed databases.\nAlso, the results from this analysis are only partially comparable with the results from our preprint, as in the preprint we worked with a more restrictive set that included publications from Semantic Scholar and PubMed.\nFunding\nThis work is funded by the Bundesministerium für Bildung und Forschung (BMBF) project KBOPENBIB (16WIK2301E). We acknowledge the support of the German Competence Center for Bibliometrics.\n\n\n\nAlperin, Juan Pablo, Jason Portenoy, Kyle Demes, Vincent Larivière, and Stefanie Haustein. 2024. “An Analysis of the Suitability of OpenAlex for Bibliometric Analyses.” arXiv. https://doi.org/10.48550/arXiv.2404.17663.\n\n\nHaupka, Nick, Jack H. Culbert, Alexander Schniedermann, Najko Jahn, and Philipp Mayr. 2024. “Analysis of the Publication and Document Types in OpenAlex, Web of Science, Scopus, PubMed and Semantic Scholar.” https://arxiv.org/abs/2406.15154.\n\n\n\n\n",
     "preview": "posts/openalex_document_types/distill-preview.png",
     "last_modified": "2024-09-04T15:29:30+02:00",
-    "input_file": "openalex_document_types_2024.knit.md",
+    "input_file": {},
     "preview_width": 1416,
     "preview_height": 1250
   },