From 5a4d12b9870452aebdd8ff94b0cbaea4d13928d1 Mon Sep 17 00:00:00 2001
From: Patrick Huck <phuck@lbl.gov>
Date: Wed, 18 Oct 2023 14:22:12 -0700
Subject: [PATCH] add springer_materials notebook

---
 .../pycroscopy.ipynb                          |  10 +
 .../springer_materials.ipynb                  | 333 ++++++++++++++++++
 2 files changed, 343 insertions(+)
 create mode 100644 mpcontribs-portal/notebooks/contribs.materialsproject.org/springer_materials.ipynb

diff --git a/mpcontribs-portal/notebooks/contribs.materialsproject.org/pycroscopy.ipynb b/mpcontribs-portal/notebooks/contribs.materialsproject.org/pycroscopy.ipynb
index 595d6031f..99f10a538 100644
--- a/mpcontribs-portal/notebooks/contribs.materialsproject.org/pycroscopy.ipynb
+++ b/mpcontribs-portal/notebooks/contribs.materialsproject.org/pycroscopy.ipynb
@@ -166,6 +166,16 @@
     "client.delete_contributions()\n",
     "client.submit_contributions(contributions)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "varied-condition",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.make_public()"
+   ]
   }
  ],
  "metadata": {
diff --git a/mpcontribs-portal/notebooks/contribs.materialsproject.org/springer_materials.ipynb b/mpcontribs-portal/notebooks/contribs.materialsproject.org/springer_materials.ipynb
new file mode 100644
index 000000000..941be79b3
--- /dev/null
+++ b/mpcontribs-portal/notebooks/contribs.materialsproject.org/springer_materials.ipynb
@@ -0,0 +1,333 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "native-patrick",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import re\n",
+    "from glob import glob\n",
+    "from mpcontribs.client import Client\n",
+    "from flatten_dict import unflatten, flatten"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "affecting-smooth",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a project - only needed once\n",
+    "# client = Client()\n",
+    "# client.create_project(\n",
+    "#     name=\"springer_materials\",\n",
+    "#     title=\"Springer Materials\",\n",
+    "#     authors=\"S. Scherer, S. George, P. Huck\",\n",
+    "#     description=\"Linus Pauling Files from Springer Materials\",\n",
+    "#     url=\"https://materials.springer.com\"\n",
+    "# )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "strange-living",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# init client and update project info if needed\n",
+    "client = Client(project=\"springer_materials\")\n",
+    "# client.make_public()  # needs approval\n",
+    "# client.update_project(update={\"unique_identifiers\": False})  # allow multiple contributions per identifier/mpid\n",
+    "# client.update_project(update={\"other\": {  # functions as a legend for root-level fields\n",
+    "#     \"springer\": \"main info about springer entry\",\n",
+    "#     \"properties\": \"meta data and availability of property entries\",\n",
+    "#     \"phasediagram\": \"meta data about phase diagram entries\"\n",
+    "# }})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "reverse-label",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load data\n",
+    "data_dir = \"/Users/patrick/GoogleDriveLBNL/My Drive/MaterialsProject/gitrepos/mpcontribs-data/springer_materials\"\n",
+    "\n",
+    "data = {}\n",
+    "for p in glob(f\"{data_dir}/*.json\"):\n",
+    "    if not p.endswith(\"_example.json\"):\n",
+    "        with open(p) as f:\n",
+    "            k = p.rsplit(\"/\", 1)[-1]\n",
+    "            data[k] = json.load(f)\n",
+    "\n",
+    "keys = set(k for docs in data.values() for doc in docs for k in doc)\n",
+    "len(data), len(keys)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "boxed-shade",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# define map for column names and their units\n",
+    "columns_map = {\n",
+    "    # common fields/columns\n",
+    "    \"Document_ID\": {\"name\": \"springer.id\"},\n",
+    "    \"Document_Title\": {\"name\": \"springer.title\"},\n",
+    "    \"Element_System\": {\"name\": \"springer.chemsys\"},\n",
+    "    \"ISP_Distinct_Solid_Phase\": {\"name\": \"springer.phase\"},\n",
+    "    \"Release_Year\": {\"name\": \"springer.released\", \"unit\": \"\"},\n",
+    "    \"URL\": {\"name\": \"springer.url\"},\n",
+    "    # properties\n",
+    "    \"Prototype\": {\"name\": \"properties.prototype\"},\n",
+    "    \"Pearson_Symbol\": {\"name\": \"properties.pearson\"},\n",
+    "    \"Space_Group_Symbol\": {\"name\": \"properties.spacegroup\"},\n",
+    "    \"Sample_Form\": {\"name\": \"properties.sample\"},\n",
+    "    \"Main_Physical_Property\": {\"name\": \"properties.main\"},\n",
+    "    \"Number_of_DataPoints\": {\"name\": \"properties.stats.datapoints\", \"unit\": \"\"},\n",
+    "    \"Number_of_Samples\": {\"name\": \"properties.stats.samples\", \"unit\": \"\"},\n",
+    "    \"Number_of_References\": {\"name\": \"properties.stats.references\", \"unit\": \"\"},\n",
+    "    # phase diagram\n",
+    "    \"Composition\": {\"name\": \"phasediagram.composition\"},\n",
+    "    \"Temperature\": {\"name\": \"phasediagram.temperature\", \"unit\": \"K\"},\n",
+    "    \"Status_of_Phase_Diagram\": {\"name\": \"phasediagram.status\"}\n",
+    "}\n",
+    "\n",
+    "keys - set(columns_map.keys()) # just making sure I didn't miss a key"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fifty-parallel",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# prep contributions\n",
+    "contributions = []\n",
+    "prop_set = set()\n",
+    "special_char_map = {ord('ä'): 'ae', ord('ü'): 'ue', ord('ö'): 'oe', ord('ß'): 'ss'}\n",
+    "CLEANR = re.compile('<.*?>') \n",
+    "\n",
+    "def convert_prop(s):\n",
+    "    cleaned = \"\".join([c if c.isalnum() else \" \" for c in s])\n",
+    "    capitalized = \"\".join([w.capitalize() for w in cleaned.split()])\n",
+    "    return capitalized.translate(special_char_map)\n",
+    "\n",
+    "def cleanhtml(raw_html):\n",
+    "    return re.sub(CLEANR, '', raw_html)\n",
+    "\n",
+    "for fn, docs in data.items():\n",
+    "    category = \"-\".join(fn.replace(\".json\", \"\").rsplit(\"_\", 2)[1:])\n",
+    "    print(category)\n",
+    "    for doc in docs:\n",
+    "        identifier = doc[\"MaterialsProject_ID\"]\n",
+    "        formula = doc[\"Molecular_Formula\"]\n",
+    "        properties = [\n",
+    "            convert_prop(prop)\n",
+    "            for prop in sorted(doc[\"List_of_Physical_Properties\"])\n",
+    "        ]\n",
+    "        contrib = {\n",
+    "            \"identifier\": identifier, \"formula\": formula,\n",
+    "            \"data\": {\"springer.category\": category},\n",
+    "        }\n",
+    "        \n",
+    "        if properties:\n",
+    "            prop_set |= set(properties)\n",
+    "            for prop in properties:\n",
+    "                contrib[\"data\"][f\"properties.available.{prop}\"] = \"Yes\"\n",
+    "\n",
+    "        for k, v in doc.items():\n",
+    "            if v:\n",
+    "                col = columns_map.get(k)\n",
+    "                if col:\n",
+    "                    name = col.get(\"name\")\n",
+    "                    if name:\n",
+    "                        unit = col.get(\"unit\")\n",
+    "                        val = \",\".join(v) if isinstance(v, list) else v\n",
+    "                        if unit is None and \"<\" in val:\n",
+    "                            val = cleanhtml(val)\n",
+    "\n",
+    "                        contrib[\"data\"][name] = f\"{val} {unit}\" if unit else val \n",
+    "                    \n",
+    "        contrib[\"data\"] = unflatten(contrib[\"data\"], splitter=\"dot\")\n",
+    "        contributions.append(contrib)\n",
+    "            \n",
+    "len(contributions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "crude-incident",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# init columns\n",
+    "columns = {v[\"name\"]: v.get(\"unit\") for v in columns_map.values()}\n",
+    "columns[\"springer.category\"] = None\n",
+    "\n",
+    "for prop in sorted(prop_set):\n",
+    "    columns[f\"properties.available.{prop}\"] = None\n",
+    "    \n",
+    "client.init_columns(columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "taken-drama",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# submit everything\n",
+    "client.delete_contributions()  # need to delete first due to `unique_identifiers=False`\n",
+    "client.init_columns(columns)  # good practice :)\n",
+    "client.submit_contributions(contributions)\n",
+    "client.init_columns(columns) # just to make sure that all columns show up in the intended order"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "digital-approval",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# list of available query parameters for this project\n",
+    "client._reinit()  # might be needed if new data was just submitted\n",
+    "client.available_query_params(startswith=(\"data__springer__released\", \"formula\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "later-discharge",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# count contributions for query:\n",
+    "# - \"physical-properties\" category\n",
+    "# - \"elasticity\" as main property\n",
+    "# - more than 5 samples\n",
+    "query = {\n",
+    "    \"data__springer__category__exact\": \"physical-properties\",\n",
+    "    \"data__properties__main__exact\": \"elasticity\",\n",
+    "    \"data__properties__stats__samples__value__gt\": 5\n",
+    "}\n",
+    "client.count(query=query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "neither-moore",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# retrieve contributions for query and project out Springer ID and spacegroup fields\n",
+    "fields = [\"id\", \"identifier\", \"data.springer.id\", \"data.properties.spacegroup\"]\n",
+    "client.query_contributions(query=query, fields=fields)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "proud-certificate",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get mp-id (and other info if needed) from Springer ID\n",
+    "springer_id = \"ppp_350781a8aa14dc0b19c6c879daff3be2\"\n",
+    "client.query_contributions(\n",
+    "    query={\"data__springer__id__exact\": springer_id},\n",
+    "    fields=[\"id\", \"identifier\", \"data.springer.id\", \"data.properties.pearson\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "protecting-guidance",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# count all entries for a list of formulas released before 2023\n",
+    "client.count(query={\n",
+    "    \"formula__in\": [\"Fe2O3\", \"GaAs\"], \"data__springer__released__value__lt\": 2023\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "detected-pricing",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get all entries containing all selected properties\n",
+    "properties = [\"XRayDiffraction\", \"IsotropicDisplacementParameter\", \"AnisotropicDisplacementParameter\"]\n",
+    "query = {f\"data__properties__available__{prop}__exact\": \"Yes\" for prop in properties}\n",
+    "\n",
+    "client.query_contributions(\n",
+    "    query=query,\n",
+    "    fields=[\"id\", \"identifier\", \"data.springer.id\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "august-farming",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# query/code to show Springer URLs and available properties under \"External Links\" on MP Details Page\n",
+    "query = {\n",
+    "    \"identifier\": \"mp-2534\",\n",
+    "    \"data__springer__category__exact\": \"physical-properties\",\n",
+    "}\n",
+    "fields = [\"data.springer.url\", \"data.properties.available\"]\n",
+    "entries = client.query_contributions(query=query, fields=fields).get(\"data\")\n",
+    "\n",
+    "# mimic table\n",
+    "for entry in entries:\n",
+    "    flat = flatten(entry, reducer=\"dot\")\n",
+    "    url = flat.pop(\"data.springer.url\")\n",
+    "    print(url)\n",
+    "    properties = [k.split(\".\")[-1] for k in flat]\n",
+    "    print(\"\\t\", \", \".join(properties))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}