# querying_and_visualizing_wikidata.py

# -*- coding: utf-8 -*-
"""Querying and Visualizing Wikidata.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Cvft2KhJm2XZVBI_cV8xtzQoqC-s42So

# Querying and Visualizing Wikidata

This script walks through using Python to query Wikidata with SPARQL and visualize the results with pandas and Plotly.

Created by Jay Winkler, Alex Wermer-Colan, Synatra Smith, and Rebecca Bayek

# Installing Libraries
"""

# Commented out IPython magic to ensure Python compatibility.
# The "!pip" shell escapes below are IPython-only syntax and are a SyntaxError
# in plain Python, so they are commented out like the %load_ext magic.
# Install the dependencies before running this script, e.g.:
#     pip install SPARQLWrapper
#     pip install -U plotly
# !pip install SPARQLWrapper
# %load_ext google.colab.data_table 
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import requests
import json
# !pip install -U plotly
import plotly.express as px
import numpy as np
import plotly.graph_objects as go

# Client bound to the Wikidata Query Service endpoint; the query is set on it
# and executed further down.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

"""# Creating a Wikidata Query in SPARQL

The following query is designed to capture a few key pieces of information about every artist with a US birthplace. The third line of the where statement asks WDQS to include every artist that is from a US state, a US city, or the United States item itself.
"""

# Select every artist together with their optional gender, sexual orientation
# and ethnicity statements, keeping only artists whose place of birth (P19) is
# a US state, a US city, or the United States item (Q30) itself, as described
# in the cell text above.
#
# The third UNION branch used to read `?birthPlace wdt:P19 wd:Q30`, which asks
# for a birthplace that was itself *born in* the United States and so cannot
# select the "birthplace is the United States item" case. It now pins
# ?birthPlace to wd:Q30 directly.
sparql.setQuery("""
SELECT
    ?artist ?artistLabel ?sexGenderLabel ?sexualOrientationLabel ?ethnicityLabel
    (group_concat(DISTINCT(?birthPlaceLabel);separator=", ") as ?birthPlaces)
    
WHERE
{
    ?artist wdt:P106 wd:Q483501 .
    ?artist wdt:P19 ?birthPlace .
    { ?birthPlace wdt:P31/wdt:P279* wd:Q35657. } UNION { ?birthPlace wdt:P31/wdt:P279* wd:Q1093829. } UNION { VALUES ?birthPlace { wd:Q30 } }
    OPTIONAL { ?artist wdt:P21 ?sexGender. }
    OPTIONAL { ?artist wdt:P91 ?sexualOrientation. }
    OPTIONAL { ?artist wdt:P172 ?ethnicity. }
    SERVICE wikibase:label { 
    bd:serviceParam wikibase:language "en". 
    ?artist rdfs:label ?artistLabel . 
    ?sexGender rdfs:label ?sexGenderLabel .
    ?birthPlace rdfs:label ?birthPlaceLabel .
    ?sexualOrientation rdfs:label ?sexualOrientationLabel .
    ?ethnicity rdfs:label ?ethnicityLabel .
  }
}
GROUP BY ?artist ?artistLabel ?sexGenderLabel ?sexualOrientationLabel ?ethnicityLabel
ORDER BY ?artistLabel
""")

# Ask for JSON so the bindings can be flattened into a DataFrame afterwards.
sparql.setReturnFormat(JSON)
# Runs the query against the endpoint (network call) and parses the response.
results = sparql.query().convert()

"""# Creating Dataframe"""

# Flatten the nested SPARQL JSON bindings: every variable becomes a set of
# dotted columns such as "artist.value".
bindings = results['results']['bindings']
data = pd.json_normalize(bindings)

# The ".value" columns carry the actual answers of the query.
cols = [
    'artist.value',
    'artistLabel.value',
    'sexGenderLabel.value',
    'sexualOrientationLabel.value',
    'ethnicityLabel.value',
    'birthPlaces.value',
]
# Notebook preview of the selected columns; has no effect when run as a script.
data[cols]

"""### Data Wrangling

Stripping QNumber from Wikidata URL, Adding QNumber to Own Column
"""

# Each artist.value is an entity URI whose last path segment is the item's
# Q-number. Splitting once from the right keeps that last segment no matter
# how many "/" the URI prefix contains (the previous fixed index 4 only
# worked for one exact URI layout).
data["QNum"] = data["artist.value"].str.rsplit("/", n=1).str[-1]

#Flattening the query results also produced several columns that simply display the datatype or language; re-declaring which columns are actually needed makes the data a little easier to work with.
new_cols = ['artist.value', 
            'artistLabel.value',
            'sexGenderLabel.value',
            'sexualOrientationLabel.value',
            'ethnicityLabel.value',
            'birthPlaces.value',
            'QNum'
          ]
# .copy() makes the trimmed frame independent of the original, so columns
# added to it later do not trigger pandas' SettingWithCopyWarning.
data = data[new_cols].copy()
# data

"""### Querying Wikidata API 

This queries the Wikidata API for the timestamp of the first (oldest) revision of each artist's item, i.e. the date the item was created.
"""

# One HTTP session is reused for all requests to the Wikidata API.
S = requests.Session()

URL = "https://www.wikidata.org/w/api.php"

# Creation date per Q-number. The SPARQL query groups by gender, orientation
# and ethnicity as well as by artist, so one artist can occupy several rows;
# each item is therefore requested only once and the answer is reused.
first_revision_dates = {}

for item in data["QNum"].unique():

  # rvdir=newer lists revisions oldest first and rvlimit=1 keeps only the
  # first of them, i.e. the revision that created the item.
  PARAMS = {
      "action": "query",
      "format": "json",
      "prop": "revisions",
      "titles": item,
      "rvprop": "timestamp",
      "rvlimit": "1",
      "rvdir": "newer"
  }

  # A timeout keeps one stalled request from hanging the whole run, and
  # raise_for_status surfaces HTTP errors instead of failing later on an
  # unexpected response body.
  R = S.get(url=URL, params=PARAMS, timeout=30)
  R.raise_for_status()
  RESULTS = R.json()

  # Pages are keyed by a page id that is not known in advance; a single
  # requested title yields a single entry, so take that entry directly.
  PAGE_DICT = RESULTS["query"]["pages"]
  ID_NUM, PAGE = next(iter(PAGE_DICT.items()))

  REVISIONS = PAGE.get("revisions")
  if not REVISIONS:
    # E.g. the title does not exist (any more); fail with a readable message
    # rather than an opaque KeyError.
    raise ValueError("No revisions returned for Wikidata item {!r}".format(item))

#Stripping just the date, the query automatically includes the time which is unnecessary.
  DATETIME = REVISIONS[0]['timestamp']
  DATE = DATETIME.split("T")[0]
  first_revision_dates[item] = DATE

# Expand the per-item dates back to one entry per DataFrame row.
finalDate = [first_revision_dates[item] for item in data["QNum"]]

data["DateAdded"] = finalDate
# data

"""### Dataframe Manipulation"""

#Importing dateutil and retyping the date column.
# The parser submodule is imported explicitly instead of relying on another
# package having already loaded it as a side effect.
import dateutil.parser
data['DateAdded'] = data['DateAdded'].apply(dateutil.parser.parse, dayfirst=False)

#Grabbing the "African American" label directly from the data as a variable.
# The label is looked up by its text rather than read from a fixed row
# number, because row positions shift whenever the Wikidata results change.
# If several labels start with that text, the most frequent one is used.
ethnicity_labels = data['ethnicityLabel.value'].dropna().astype(str)
a_a_candidates = ethnicity_labels[ethnicity_labels.str.startswith("African American")]
if a_a_candidates.empty:
    raise ValueError('No ethnicity label starting with "African American" found in the query results')
a_a = a_a_candidates.value_counts().idxmax()

#Replacing NaN with "None" in the ethnicityLabel.value column.
# `nan` is the placeholder itself; it used to be read back from a fixed row
# that was merely assumed to have had no ethnicity statement.
nan = "None"
new_eth = data["ethnicityLabel.value"].fillna(nan)
data["ethnicityLabel.value"] = new_eth

# data

#Replacing all ethnicities other than "African American" and "None" with "Other"
keep = [a_a, nan]
eth_concat = data['ethnicityLabel.value'].where(data['ethnicityLabel.value'].isin(keep), "Other")
data['ethnicityLabel.value'] = eth_concat

# data

#Sorting by date, resetting index.
# A stable sort keeps rows that share a date in their previous order, so the
# running percentages computed afterwards are reproducible between runs.
data = data.sort_values(by='DateAdded', kind='stable')
data = data.reset_index()
# data

"""This next section creates columns that track change over time for the percenteage of each ethnic group represented in Wikidata."""

#For every row (in date order) compute which share of the artists added so far
#falls into each ethnicity group. A cumulative sum of the matches divided by
#the running row count gives the same numbers as re-counting the whole prefix
#of the list after every item, without the quadratic cost of doing so.
eth_list = data['ethnicityLabel.value'].tolist()
ethnicity = data['ethnicityLabel.value']
row_number = np.arange(1, len(ethnicity) + 1)

a_a_pct = ((ethnicity == a_a).to_numpy().cumsum() / row_number).tolist()
other_pct = ((ethnicity == 'Other').to_numpy().cumsum() / row_number).tolist()
nan_pct = ((ethnicity == nan).to_numpy().cumsum() / row_number).tolist()

#Add each of the lists to the dataframe as a column.

data['AfricanAmericanPct'] = a_a_pct
data['OtherPct'] = other_pct
data['NonePct'] = nan_pct
# data

"""# Data Visualization

Create the figure of All Artists in Plotly Graphic Objects
"""

#Build the national chart: one line per ethnicity group over time.
fig_main = go.Figure()

#Percentage column paired with the legend entry it is drawn under.
main_series = (
    ('AfricanAmericanPct', "African American Ethnicity Statement"),
    ('OtherPct', "Non-African American Ethnicity Statement"),
    ('NonePct', "No Ethnicity Statement"),
)
for pct_column, legend_name in main_series:
    main_trace = go.Scatter(
        x=data['DateAdded'],
        y=data[pct_column],
        name=legend_name,
    )
    fig_main.add_trace(main_trace)

#Title, percent-formatted tick labels, and y-axis ticks every 0.2 starting from 0.1.
fig_main.update_layout(
    title="All Artists",
    yaxis=dict(tickformat=".1%"),
)
fig_main.update_yaxes(
    title="Percentage of Artists",
    tick0=0.1,
    dtick=0.2,
)

fig_main.show()

"""## Refining DataFrame"""

#Declare a Philadelphia variable. 
gritty = ['Philadelphia']

#Columns carried over into the city-level DataFrame.
phi_cols = ['artist.value', 
            'artistLabel.value',
            'sexGenderLabel.value',
            'sexualOrientationLabel.value',
            'ethnicityLabel.value',
            'birthPlaces.value',
            'QNum',
            'DateAdded'
          ]

#Create a new DataFrame that only includes the rows whose birthPlaces.value
#matches the variable exactly (isin is an exact match on the whole string).
#Rows and columns are selected in one .loc call and copied, so the result is
#an independent frame that can take new columns without pandas'
#SettingWithCopyWarning.
phi_data = data.loc[data['birthPlaces.value'].isin(gritty), phi_cols].copy()
phi_data

"""Repeating the Process of gathering percentages and adding them to the new DataFrame"""

#Running share of each ethnicity group among the Philadelphia artists, in row
#order. A cumulative sum of the matches divided by the running row count
#gives the same values as re-counting the list prefix after every item,
#without the quadratic cost.
ph_eth_list = phi_data['ethnicityLabel.value'].tolist()
ph_ethnicity = phi_data['ethnicityLabel.value']
ph_row_number = np.arange(1, len(ph_ethnicity) + 1)

ph_a_a_pct = ((ph_ethnicity == a_a).to_numpy().cumsum() / ph_row_number).tolist()
ph_other_pct = ((ph_ethnicity == 'Other').to_numpy().cumsum() / ph_row_number).tolist()
ph_nan_pct = ((ph_ethnicity == nan).to_numpy().cumsum() / ph_row_number).tolist()

#assign() returns a new DataFrame, so the columns are added without writing
#into what may be a slice of the national DataFrame.
phi_data = phi_data.assign(
    AfricanAmericanPct=ph_a_a_pct,
    OtherPct=ph_other_pct,
    NonePct=ph_nan_pct,
)
phi_data

"""## Graphing the Philly Data"""

#The city-level chart follows the national one: a line per ethnicity group,
#plus an annotation for the LEADING project period.
fig_phi = go.Figure()

#Percentage column paired with the legend entry it is drawn under.
phi_series = (
    ('AfricanAmericanPct', "African American Ethnicity Statement"),
    ('OtherPct', "Non-African American Ethnicity Statement"),
    ('NonePct', "No Ethnicity Statement"),
)
for pct_column, legend_name in phi_series:
    phi_trace = go.Scatter(
        x=phi_data['DateAdded'],
        y=phi_data[pct_column],
        name=legend_name,
    )
    fig_phi.add_trace(phi_trace)

#Text-only trace that labels the shape marking the LEADING project period.
leading_label = go.Scatter(
    x=['2021-07-10'],
    y=[0.20],
    text="LEADING Project",
    mode="text",
    showlegend=False,
)
fig_phi.add_trace(leading_label)

#Title, percent-formatted tick labels, and y-axis ticks every 0.2 starting from 0.1.
fig_phi.update_layout(
    title="Philadelphia Artists",
    yaxis=dict(tickformat=".1%"),
)
fig_phi.update_yaxes(
    title="Percentage of Artists",
    tick0=0.1,
    dtick=0.2,
)

#Shape highlighting the LEADING project period.
fig_phi.add_shape(
    type="circle",
    x0='2021-07-15',
    x1='2021-12-31',
    y0=.1,
    y1=.18,
    line_color='LightSeaGreen',
    name='LEADING Project Period',
)

fig_phi.show()

"""## Repeat the Philly Process for New York City"""

#Keep only the rows whose birthPlaces.value is exactly 'New York City'.
walkin_here = ['New York City']
ny_cols = ['artist.value', 
            'artistLabel.value',
            'sexGenderLabel.value',
            'sexualOrientationLabel.value',
            'ethnicityLabel.value',
            'birthPlaces.value',
            'QNum',
            'DateAdded'
          ]
#Rows and columns are selected in one .loc call and copied, so the percentage
#columns can be added below without pandas' SettingWithCopyWarning.
ny_data = data.loc[data['birthPlaces.value'].isin(walkin_here), ny_cols].copy()
ny_data

#Running share of each ethnicity group among the New York City artists, in
#row order. A cumulative sum of the matches divided by the running row count
#gives the same values as re-counting the list prefix after every item,
#without the quadratic cost.
ny_eth_list = ny_data['ethnicityLabel.value'].tolist()
ny_ethnicity = ny_data['ethnicityLabel.value']
ny_row_number = np.arange(1, len(ny_ethnicity) + 1)

ny_a_a_pct = ((ny_ethnicity == a_a).to_numpy().cumsum() / ny_row_number).tolist()
ny_other_pct = ((ny_ethnicity == 'Other').to_numpy().cumsum() / ny_row_number).tolist()
ny_nan_pct = ((ny_ethnicity == nan).to_numpy().cumsum() / ny_row_number).tolist()

ny_data['AfricanAmericanPct'] = ny_a_a_pct
ny_data['OtherPct'] = ny_other_pct
ny_data['NonePct'] = ny_nan_pct
ny_data

#New York City chart: one line per ethnicity group over time.
fig_ny = go.Figure()

#Percentage column paired with the legend entry it is drawn under.
ny_series = (
    ('AfricanAmericanPct', "African American Ethnicity Statement"),
    ('OtherPct', "Non-African American Ethnicity Statement"),
    ('NonePct', "No Ethnicity Statement"),
)
for pct_column, legend_name in ny_series:
    ny_trace = go.Scatter(
        x=ny_data['DateAdded'],
        y=ny_data[pct_column],
        name=legend_name,
    )
    fig_ny.add_trace(ny_trace)

#Title, percent-formatted tick labels, and y-axis ticks every 0.2 starting from 0.1.
fig_ny.update_layout(
    title="New York City Artists",
    yaxis=dict(tickformat=".1%"),
)
fig_ny.update_yaxes(
    title="Percentage of Artists",
    tick0=0.1,
    dtick=0.2,
)

fig_ny.show()

"""## Detroit"""

#Keep only the rows whose birthPlaces.value is exactly 'Detroit'.
vs_everybody = ['Detroit']
det_cols = ['artist.value', 
            'artistLabel.value',
            'sexGenderLabel.value',
            'sexualOrientationLabel.value',
            'ethnicityLabel.value',
            'birthPlaces.value',
            'QNum',
            'DateAdded'
          ]
#Select with this section's own column list (det_cols); the previous code
#indexed with ny_cols, a copy-paste slip that made this section depend on the
#New York City one. The copy lets the percentage columns be added below
#without pandas' SettingWithCopyWarning.
det_data = data.loc[data['birthPlaces.value'].isin(vs_everybody), det_cols].copy()
det_data

#Running share of each ethnicity group among the Detroit artists, in row
#order. A cumulative sum of the matches divided by the running row count
#gives the same values as re-counting the list prefix after every item,
#without the quadratic cost.
det_eth_list = det_data['ethnicityLabel.value'].tolist()
det_ethnicity = det_data['ethnicityLabel.value']
det_row_number = np.arange(1, len(det_ethnicity) + 1)

det_a_a_pct = ((det_ethnicity == a_a).to_numpy().cumsum() / det_row_number).tolist()
det_other_pct = ((det_ethnicity == 'Other').to_numpy().cumsum() / det_row_number).tolist()
det_nan_pct = ((det_ethnicity == nan).to_numpy().cumsum() / det_row_number).tolist()

det_data['AfricanAmericanPct'] = det_a_a_pct
det_data['OtherPct'] = det_other_pct
det_data['NonePct'] = det_nan_pct
det_data

#Detroit chart: one line per ethnicity group over time.
fig_det = go.Figure()

#Percentage column paired with the legend entry it is drawn under.
det_series = (
    ('AfricanAmericanPct', "African American Ethnicity Statement"),
    ('OtherPct', "Non-African American Ethnicity Statement"),
    ('NonePct', "No Ethnicity Statement"),
)
for pct_column, legend_name in det_series:
    det_trace = go.Scatter(
        x=det_data['DateAdded'],
        y=det_data[pct_column],
        name=legend_name,
    )
    fig_det.add_trace(det_trace)

#Title, percent-formatted tick labels, and y-axis ticks every 0.2 starting from 0.1.
fig_det.update_layout(
    title="Detroit Artists",
    yaxis=dict(tickformat=".1%"),
)
fig_det.update_yaxes(
    title="Percentage of Artists",
    tick0=0.1,
    dtick=0.2,
)

fig_det.show()

"""## Chicago"""

#Keep only the rows whose birthPlaces.value is exactly 'Chicago'.
melort = ['Chicago']
chi_cols = ['artist.value', 
            'artistLabel.value',
            'sexGenderLabel.value',
            'sexualOrientationLabel.value',
            'ethnicityLabel.value',
            'birthPlaces.value',
            'QNum',
            'DateAdded'
          ]
#Rows and columns are selected in one .loc call and copied, so the percentage
#columns can be added below without pandas' SettingWithCopyWarning.
chi_data = data.loc[data['birthPlaces.value'].isin(melort), chi_cols].copy()
chi_data

#Running share of each ethnicity group among the Chicago artists, in row
#order. A cumulative sum of the matches divided by the running row count
#gives the same values as re-counting the list prefix after every item,
#without the quadratic cost.
chi_eth_list = chi_data['ethnicityLabel.value'].tolist()
chi_ethnicity = chi_data['ethnicityLabel.value']
chi_row_number = np.arange(1, len(chi_ethnicity) + 1)

chi_a_a_pct = ((chi_ethnicity == a_a).to_numpy().cumsum() / chi_row_number).tolist()
chi_other_pct = ((chi_ethnicity == 'Other').to_numpy().cumsum() / chi_row_number).tolist()
chi_nan_pct = ((chi_ethnicity == nan).to_numpy().cumsum() / chi_row_number).tolist()

chi_data['AfricanAmericanPct'] = chi_a_a_pct
chi_data['OtherPct'] = chi_other_pct
chi_data['NonePct'] = chi_nan_pct
chi_data

#Chicago chart: one line per ethnicity group over time.
fig_chi = go.Figure()

#Percentage column paired with the legend entry it is drawn under.
chi_series = (
    ('AfricanAmericanPct', "African American Ethnicity Statement"),
    ('OtherPct', "Non-African American Ethnicity Statement"),
    ('NonePct', "No Ethnicity Statement"),
)
for pct_column, legend_name in chi_series:
    chi_trace = go.Scatter(
        x=chi_data['DateAdded'],
        y=chi_data[pct_column],
        name=legend_name,
    )
    fig_chi.add_trace(chi_trace)

#Title, percent-formatted tick labels, and y-axis ticks every 0.2 starting from 0.1.
fig_chi.update_layout(
    title="Chicago Artists",
    yaxis=dict(tickformat=".1%"),
)
fig_chi.update_yaxes(
    title="Percentage of Artists",
    tick0=0.1,
    dtick=0.2,
)

fig_chi.show()

"""# Create a graph that shows all cities."""

#One chart combining the national data with every city. "Other" is left out
#for readability, traces are grouped in the legend by ethnicity statement,
#and the national lines are drawn with a greater line weight.
fig_merge = go.Figure()

#(percentage column, legend group id, legend group title)
legend_groups = (
    ('AfricanAmericanPct', "AfricanAmerican", "African American Ethnicity Statement"),
    ('NonePct', "None", "No Ethnicity Statement"),
)

#National traces come first; they carry the legend group titles.
for pct_column, group_id, group_title in legend_groups:
    national_trace = go.Scatter(
        x=data['DateAdded'],
        y=data[pct_column],
        name="National Data",
        legendgroup=group_id,
        legendgrouptitle_text=group_title,
        line=dict(width=4),
    )
    fig_merge.add_trace(national_trace)

#City traces follow, city by city, each joining the matching legend group.
city_frames = (
    ("Chicago", chi_data),
    ("Detroit", det_data),
    ("New York City", ny_data),
    ("Philadelphia", phi_data),
)
for city_name, city_frame in city_frames:
    for pct_column, group_id, _group_title in legend_groups:
        city_trace = go.Scatter(
            x=city_frame['DateAdded'],
            y=city_frame[pct_column],
            name=city_name,
            legendgroup=group_id,
        )
        fig_merge.add_trace(city_trace)

#Title, percent-formatted tick labels, and y-axis ticks every 0.2 starting from 0.1.
fig_merge.update_layout(
    title="Artists Across Cities",
    yaxis=dict(tickformat=".1%"),
)
fig_merge.update_yaxes(
    title="Percentage of Artists",
    tick0=0.1,
    dtick=0.2,
)

fig_merge.show()