Add notebooks and processed results folder
PastelBelem8 committed May 1, 2024
1 parent 8e9d3cd commit 9bfab13
Showing 20 changed files with 10,592 additions and 0 deletions.
399 changes: 399 additions & 0 deletions notebooks/0.Preprocess-baselines.ipynb
@@ -0,0 +1,399 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0bf16f16",
"metadata": {},
"source": [
"## Preprocessing of baselines' original files\n",
"\n",
"\n",
"In this notebook, we preprocess the original files to make it applicable in our subset. Since, we focus on gender and the subset of the pronouns, most modifications will concern selecting the appropriate evaluation subset and ensuring that there's a common structure to the dataframes, including 'sentence', 'template', etc."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c9d424b",
"metadata": {},
"outputs": [],
"source": [
"# directory where we place the original raw files from WinoBias and Winogender\n",
"ORIG_FILES = \"../data\"\n",
"# directory containing an intermediate version of the files\n",
"RAW_DIR = \"../results-baselines\"\n",
"# directory containing the final version of the files\n",
"# (the ones that are actually referred to in the evaluation scripts)\n",
"PREPROC_DIR = \"../results-baselines/final-results\"\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os, json\n",
"os.makedirs(RAW_DIR, exist_ok=True)\n",
"os.makedirs(PREPROC_DIR, exist_ok=True)\n",
"\n",
"# The notebooks folder should be at the same level as the code folder...\n",
"import sys; sys.path.append(\"../src\")\n",
"from run_pipeline import parse_replace_placeholders"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8174382e",
"metadata": {},
"outputs": [],
"source": [
"with open(\"../configs/placeholders.json\") as f:\n",
" PLACEHOLDERS = json.load(f)\n",
" \n",
"PLACEHOLDERS"
]
},
{
"cell_type": "markdown",
"id": "a7aba9c6",
"metadata": {},
"source": [
"### Winobias\n",
"\n",
"Proposed by Zhao et al 2018, roughly around the same time as WinoGender, comprises two types of coreference resolution examples. The first type, called Type 1, concerns the examples whose pronoun disambiguation requires implicit world knowledge and has no cues in the syntax or semantics of the example. The second type, called Type 2, is the easier set of coreference resolution examples, since syntax and semantics can help disambiguate the correct pronoun.\n",
"\n",
"\n",
"**Note**: ~~We do not need to download both anti-stereotypical and stereotypical associations because they are \"symmetrical\". That is, replacing the pronoun with the opposite template, would result in the stereotypical association.~~(Edit: Actually, we do need to download every file, since in some cases, we will find ourselves with sentences using pronouns \"her\" and we won't know which male pronoun to replace it with. Instead of wasting ChatGPT resources running this, we will process every file and remove duplicates in the end.)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0aa59978",
"metadata": {},
"outputs": [],
"source": [
"import re, glob\n",
"from collections import defaultdict\n",
"\n",
"WINOBIAS_REGEX = r\"(?P<entity>\\[.+?\\]).+?(?P<pronoun>\\[.+?\\])\"\n",
"WINOBIAS_PATTERN = re.compile(WINOBIAS_REGEX)\n",
"\n",
"def get_words(sentence: str, pattern=WINOBIAS_PATTERN):\n",
" match = pattern.search(sentence)\n",
" \n",
" attribute = match.group(\"entity\")\n",
" target = match.group(\"pronoun\")\n",
" return attribute, target\n",
"\n",
"def read_winobias(path: str):\n",
" results = defaultdict(list)\n",
" with open(path) as f:\n",
" for l in f.readlines():\n",
" l = l.strip()\n",
" l = re.sub(r\"^[0-9]{1,3} \", \"\", l)\n",
" \n",
" attr, target = get_words(l)\n",
" \n",
" l = l.replace(attr, attr[1:-1])\n",
" l = l.replace(target, target[1:-1])\n",
" \n",
" results[\"sentence\"].append(l)\n",
" results[\"word\"].append(attr[1:-1])\n",
" results[\"target_word\"].append(target[1:-1])\n",
" \n",
" for expr in (\"she\", \"her\", \"hers\", \"herself\"):\n",
" if target[1:-1].lower() == expr:\n",
" results[\"drop\"].append(True) # mark female results to drop\n",
" break\n",
" else:\n",
" results[\"drop\"].append(False)\n",
" \n",
" results = pd.DataFrame(results)\n",
" # Drop female results\n",
" results = results[~results[\"drop\"]]\n",
" \n",
" # Add information about the original file\n",
" filename = path.rpartition(\"/\")[-1]\n",
" results[\"filename\"] = filename\n",
" \n",
" results[\"stereotype\"] = \"pro_stereotyped\" in filename\n",
" results[\"is_challenging\"] = \"type1\" in filename\n",
" results[\"is_dev\"] = \".dev\" in filename\n",
" \n",
" return pd.DataFrame(results)\n",
"\n",
"\n",
"for SUFFIX in (\".dev\", \".test\"):\n",
" # List all filepaths in the directory\n",
" FILEPATHS = glob.glob(f\"{ORIG_FILES}/winobias-zhao-2018/*.txt{SUFFIX}\")\n",
" # Merge all the examples in dev, regardless of the type\n",
" winobias = pd.concat([read_winobias(fp) for fp in FILEPATHS]).sort_values(\"sentence\").reset_index(drop=True)\n",
" # Parse the templates, creating a template and determining whether the necessary pronouns appear.\n",
" winobias_has_pronoun, winobias_template = parse_replace_placeholders(\n",
" winobias[\"sentence\"].values.tolist(),\n",
" PLACEHOLDERS[\"gender_to_placeholder\"],\n",
" )\n",
" # Add information to the original file\n",
" winobias.insert(len(winobias.columns), \"has_pronoun\", winobias_has_pronoun)\n",
" winobias.insert(len(winobias.columns), \"template\", winobias_template)\n",
" assert winobias[\"has_pronoun\"].all(), \"Some templates did not have a pronoun replaced\"\n",
" winobias.to_csv(f\"{RAW_DIR}/coref__Winobias__templates{SUFFIX}.csv\")\n",
"winobias.head(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8755a84",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(f\"{RAW_DIR}/coref__Winobias__templates.dev.csv\", index_col=0)\n",
"# let's drop the article\n",
"df[\"word\"] = df[\"word\"].apply(lambda x: x.split()[-1]).apply(str.lower)\n",
"df.to_csv(f\"{PREPROC_DIR}/coref__Winobias__templates.dev.csv\")\n",
"\n",
"\n",
"df = pd.read_csv(f\"{RAW_DIR}/coref__Winobias__templates.test.csv\", index_col=0)\n",
"# let's drop the article\n",
"df[\"word\"] = df[\"word\"].apply(lambda x: x.split()[-1]).apply(str.lower)\n",
"df.to_csv(f\"{PREPROC_DIR}/coref__Winobias__templates.test.csv\")"
]
},
{
"cell_type": "markdown",
"id": "edacd719",
"metadata": {},
"source": [
"### Winogender"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0047115b",
"metadata": {},
"outputs": [],
"source": [
"def canonical_sentid(sentid: str) -> str:\n",
" \"\"\"Given the sentid field in the original Winogender files, strip them.\"\"\"\n",
" for exp in (\".male.txt\", \".female.txt\", \".neutral.txt\"):\n",
" if sentid.endswith(exp):\n",
" return sentid[:-len(exp)] \n",
" return sentid\n",
"\n",
"winogender = pd.read_csv(f\"{ORIG_FILES}/winogender-rudinger-2018/all_sentences.csv\")\n",
"winogender.insert(1, \"example_id\", winogender[\"sentid\"].apply(canonical_sentid))\n",
"winogender.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4cc7997f",
"metadata": {},
"outputs": [],
"source": [
"# Since the sentences are the same, only changing the completion, drop all but the first.\n",
"winogender_subset = winogender.groupby(\"example_id\").head(1)\n",
"# Create template from each sentence using the placeholders\n",
"winogender_has_pronoun, winogender_template = parse_replace_placeholders(\n",
" winogender_subset[\"sentence\"].values.tolist(),\n",
" PLACEHOLDERS[\"gender_to_placeholder\"],\n",
")\n",
"\n",
"# Create columns 'has_pronoun', 'template'\n",
"winogender_subset.insert(len(winogender_subset.columns), \"has_pronoun\", winogender_has_pronoun)\n",
"winogender_subset.insert(len(winogender_subset.columns), \"template\", winogender_template)\n",
"assert winogender_subset[\"has_pronoun\"].all(), \"Some templates did not have a pronoun replaced\"\n",
"winogender_subset.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6be687d2",
"metadata": {},
"outputs": [],
"source": [
"# Store\n",
"winogender_subset.to_csv(f\"{RAW_DIR}/coref__Winogender__templates.csv\", index=None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c457e314",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(f\"{RAW_DIR}/coref__Winogender__templates.csv\", index_col=0)\n",
"# let's derive the word column\n",
"def obtain_word_winogender(example_id):\n",
" e1, e2, idx = example_id.split(\".\")\n",
" return (e2 if idx == \"1\" else e1).lower()\n",
"\n",
"df[\"word\"] = df[\"example_id\"].apply(obtain_word_winogender)\n",
"df.to_csv(f\"{PREPROC_DIR}/coref__Winogender__templates.csv\")"
]
},
{
"cell_type": "markdown",
"id": "c3ddf778",
"metadata": {},
"source": [
"### Stereoset\n",
"\n",
"Even though Stereoset has two target words as \"herself\" (which wouldn't be difficult to disambiguate), the templates themselves have some pronouns. We have to create placeholders for these different cases."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ead517a7",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(f\"{RAW_DIR}/lm__StereoSet_pronouns_only.csv\")\n",
"df = df.drop_duplicates()\n",
"df.groupby(\"target_word\").count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "720b558e",
"metadata": {},
"outputs": [],
"source": [
"df[\"sentence\"].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd356491",
"metadata": {},
"outputs": [],
"source": [
"# We need to update the templates in herself\n",
"# -- since they are few, i'll do it manually :(\n",
"# -- we fixed the grammar on one example with stereoset id 3d2572d81efcfdb0829aea957f238b3f\n",
"df.drop(\"target_word\",axis=1).to_csv(f\"{PREPROC_DIR}/lm__StereoSet_pronouns_only.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a02e046b",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(f\"{PREPROC_DIR}/lm__StereoSet_pronouns_only.csv\")\n",
"df = df.drop_duplicates()\n",
"len(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d56448cc",
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(f\"{PREPROC_DIR}/lm__StereoSet_pronouns_only.csv\", index=None)"
]
},
{
"cell_type": "markdown",
"id": "fc74d6a6",
"metadata": {},
"source": [
"### CrowS pairs\n",
"\n",
"We perform the following transformations to the original dataset:\n",
"\n",
"1. Restrict to the bias_type 'gender'\n",
"2. Keep the examples with more 60% agreement in terms of the bias type of the examples.\n",
"3. Since the sentences are minimally edited in the demographics, we will also try to check for the presence of male pronouns. Note that our code by default looks for male pronouns he/his/him only. However, because these examples reflect binary gender modifications, if our method fails to detect male pronoun in 'sent_more' because it was a female completion, then we assume that the 'sent_less' (which would be the male completion) should have the male pronouns. \n",
"\n",
"**Note**: Not all the resulting sentences are equally likely under both genders, due to the presence of gendered lexicon, like in sentences \"As a woman, {SHE/HE} felt empowered ...\"."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a0ea2e5",
"metadata": {},
"outputs": [],
"source": [
"RAW_CROWS = pd.read_csv(f\"{BASE_DIR}/results-baselines/lm__CrowS.csv\", index_col=0)\n",
"len(RAW_CROWS)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7bc013c",
"metadata": {},
"outputs": [],
"source": [
"import json, sys; sys.path.append(\"../code\")\n",
"from run_pipeline import parse_replace_placeholders\n",
"\n",
"# gender-bias subset\n",
"RAW_CROWS = RAW_CROWS[RAW_CROWS[\"bias_type\"] == \"gender\"]\n",
"\n",
"# keep examples w/ \"good agreement\"\n",
"annotations = RAW_CROWS[\"annotations\"].apply(lambda x: [annot == ['gender'] for annot in eval(x)])\n",
"# Note: we want at least 4 annotations and 3 of them should agree\n",
"b = annotations.apply(lambda x: sum(x) / len(x) > 0.60)\n",
"\n",
"sents_more = RAW_CROWS[\"sent_more\"].values.tolist()\n",
"sents_less = RAW_CROWS[\"sent_less\"].values.tolist()\n",
"\n",
"# What happens is that these are \n",
"has_pronoun_more, template_more = parse_replace_placeholders(sents_more, PLACEHOLDERS[\"gender_to_placeholder\"])\n",
"has_pronoun_less, template_less = parse_replace_placeholders(sents_less, PLACEHOLDERS[\"gender_to_placeholder\"])\n",
"\n",
"mask = (np.array(has_pronoun_more) | np.array(has_pronoun_less))\n",
"RAW_CROWS[mask]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c634aead",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "20074446",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
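The templating cells above all rely on `parse_replace_placeholders` from `src/run_pipeline.py`, which is not part of this diff. A minimal sketch of what such a pronoun-to-placeholder pass might look like — the `GENDER_TO_PLACEHOLDER` mapping, the placeholder names, and the `make_template` helper are all hypothetical stand-ins, since the real mapping lives in `configs/placeholders.json`:

```python
import re

# Hypothetical pronoun-to-placeholder mapping; the real mapping is loaded
# from configs/placeholders.json, which is not shown in this commit.
GENDER_TO_PLACEHOLDER = {
    "he": "{PRONOUN}",
    "him": "{PRONOUN1}",
    "his": "{PRONOUN2}",
}

def make_template(sentence, mapping=GENDER_TO_PLACEHOLDER):
    """Return (has_pronoun, template), mirroring the tuple shape that the
    notebook unpacks from parse_replace_placeholders."""
    template, has_pronoun = sentence, False
    for pronoun, placeholder in mapping.items():
        # \b keeps us from matching substrings (e.g. the "his" inside "this");
        # the real implementation likely also handles capitalized forms.
        replaced = re.sub(rf"\b{pronoun}\b", placeholder, template)
        if replaced != template:
            has_pronoun = True
        template = replaced
    return has_pronoun, template
```

The `has_pronoun` flag is what the notebook's `assert ...all()` checks rely on: any sentence where no pronoun was replaced would fail the assertion rather than silently produce an unusable template.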
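For reference, the WinoBias parsing in `read_winobias` can be exercised on a single line. This self-contained demo uses the same regex as the notebook; the example sentence is adapted to the WinoBias file format (a leading index, with the entity and pronoun in square brackets) rather than copied from the dataset:

```python
import re

# Same pattern as in the notebook: the first bracketed span is the entity
# (occupation), the second is the pronoun to disambiguate.
WINOBIAS_PATTERN = re.compile(r"(?P<entity>\[.+?\]).+?(?P<pronoun>\[.+?\])")

# Example line in the WinoBias file format (hypothetical sentence).
line = "1 [The developer] argued with the designer because [he] did not like the design."
line = re.sub(r"^[0-9]{1,3} ", "", line)  # drop the leading example index

m = WINOBIAS_PATTERN.search(line)
entity, pronoun = m.group("entity"), m.group("pronoun")

# Strip the square brackets from both spans, as read_winobias does
clean = line.replace(entity, entity[1:-1]).replace(pronoun, pronoun[1:-1])
```

The lazy `.+?` quantifiers matter here: a greedy `.+` would swallow everything up to the last `]`, merging both bracketed spans into the entity group.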