printAraWEATLists.py

import pickle
import codecs

lang = "ar"
output_file =  codecs.open("WEAT_Arabic_list" ,"w", "utf-8")


def weat_1():
    """
WEAT 1 (universally accepted): Targets 1=Flowers, Targets 2=Insects, Attributes 1=pleasant, Attributes 2=unpleasant
:return: targets_1, targets_2, attributes_1, attributes_2
"""
    targets_1 = ["aster", "clover", "hyacinth", "marigold", "poppy", "azalea", "crocus", "iris", "orchid", "rose",
                 "blue-bell", "daffodil", "lilac", "pansy", "tulip", "buttercup", "daisy", "lily", "peony",
                 "violet",
                 "carnation", "gladiola", "magnolia", "petunia", "zinnia"]
    targets_2 = ["ant", "caterpillar", "flea", "locust", "spider", "bedbug", "centipede", "fly", "maggot",
                 "tarantula",
                 "bee", "cockroach", "gnat", "mosquito", "termite", "beetle", "cricket", "hornet", "moth", "wasp",
                 "blackfly",
                 "dragonfly", "horsefly", "roach", "weevil"]
    attributes_1 = ["caress", "freedom", "health", "love", "peace", "cheer", "friend", "heaven", "loyal",
                    "pleasure",
                    "diamond", "gentle", "honest", "lucky", "rainbow", "diploma", "gift", "honor", "miracle",
                    "sunrise",
                    "family",
                    "happy", "laughter", "paradise", "vacation"]
    attributes_2 = ["abuse", "crash", "filth", "murder", "sickness", "accident", "death", "grief", "poison",
                    "stink",
                    "assault", "disaster", "hatred", "pollute", "tragedy", "divorce", "jail", "poverty", "ugly",
                    "cancer",
                    "kill", "rotten",
                    "vomit", "agony", "prison"]
    return targets_1, targets_2, attributes_1, attributes_2


# WEAT 2: Target 1=Musical instruments, Target 2= Weapons, Attributes 1: pleasant, Attributes 2=unpleasant
def weat_2():
    targets_1 = ["bagpipe", "cello", "guitar", "lute", "trombone", "banjo", "clarinet", "harmonica", "mandolin",
                 "trumpet", "bassoon", "drum", "harp", "oboe", "tuba", "bell", "fiddle", "harpsichord", "piano",
                 "viola", "bongo",
                 "flute", "horn", "saxophone", "violin"]
    targets_2 = ["arrow", "club", "gun", "missile", "spear", "axe", "dagger", "harpoon", "pistol", "sword", "blade",
                 "dynamite", "hatchet", "rifle", "tank", "bomb", "firearm", "knife", "shotgun", "teargas", "cannon",
                 "grenade",
                 "mace", "slingshot", "whip"]
    attributes_1 = ["caress", "freedom", "health", "love", "peace", "cheer", "friend", "heaven", "loyal",
                    "pleasure",
                    "diamond", "gentle", "honest", "lucky", "rainbow", "diploma", "gift", "honor", "miracle",
                    "sunrise",
                    "family", "happy", "laughter", "paradise", "vacation"]
    attributes_2 = ["abuse", "crash", "filth", "murder", "sickness", "accident", "death", "grief", "poison",
                    "stink",
                    "assault", "disaster", "hatred", "pollute", "tragedy", "divorce", "jail", "poverty", "ugly",
                    "cancer",
                    "kill", "rotten",
                    "vomit", "agony", "prison"]

    return targets_1, targets_2, attributes_1, attributes_2


# Here they deleted the infrequent african american names, and the same number randomly choosen from the european american names
def weat_3():
    # excluded in the original paper: Chip, Ian, Fred, Jed, Todd, Brandon, Wilbur, Sara, Amber, Crystal, Meredith, Shannon, Donna,
    # Bobbie-Sue, Peggy, Sue-Ellen, Wendy
    targets_1 = ["Adam", "Harry", "Josh", "Roger", "Alan", "Frank", "Justin", "Ryan", "Andrew", "Jack", "Matthew",
                 "Stephen",
                 "Brad", "Greg", "Paul", "Hank", "Jonathan", "Peter", "Amanda", "Courtney", "Heather", "Melanie",
                 "Katie", "Betsy", "Kristin", "Nancy", "Stephanie", "Ellen", "Lauren", "Colleen", "Emily", "Megan",
                 "Rachel",
                 "Chip", "Ian", "Fred", "Jed", "Todd", "Brandon", "Wilbur", "Sara", "Amber", "Crystal", "Meredith",
                 "Shannon",
                 "Donna", "Bobbie-Sue", "Peggy", "Sue-Ellen", "Wendy"]

    # excluded: Lerone, Percell, Rasaan, Rashaun, Everol, Terryl, Aiesha, Lashelle, Temeka, Tameisha, Teretha, Latonya, Shanise,
    # Sharise, Tashika, Lashandra, Shavonn, Tawanda,
    targets_2 = ["Alonzo", "Jamel", "Theo", "Alphonse", "Jerome", "Leroy", "Torrance", "Darnell", "Lamar", "Lionel",
                 "Tyree", "Deion", "Lamont", "Malik", "Terrence", "Tyrone", "Lavon", "Marcellus", "Wardell",
                 "Nichelle",
                 "Shereen", "Ebony", "Latisha", "Shaniqua", "Jasmine", "Tanisha", "Tia", "Lakisha", "Latoya",
                 "Yolanda",
                 "Malika", "Yvette", "Lerone", "Percell", "Rasaan", "Rashaun", "Everol", "Terryl", "Aiesha",
                 "Lashelle",
                 "Temeka", "Tameisha", "Teretha", "Latonya", "Shanise", "Sharise", "Tashika", "Lashandra",
                 "Shavonn", "Tawanda"]

    attributes_1 = ["caress", "freedom", "health", "love", "peace", "cheer", "friend", "heaven", "loyal",
                    "pleasure",
                    "diamond", "gentle", "honest", "lucky", "rainbow", "diploma", "gift", "honor", "miracle",
                    "sunrise",
                    "family", "happy", "laughter", "paradise", "vacation"]

    # same as before but agony and prison are replaced with bomb and evil
    attributes_2 = ["abuse", "crash", "filth", "murder", "sickness", "accident", "death", "grief", "poison",
                    "stink",
                    "assault", "disaster", "hatred", "pollute", "tragedy", "divorce", "jail", "poverty", "ugly",
                    "cancer",
                    "kill", "rotten",
                    "vomit", "bomb", "evil"]

    return targets_1, targets_2, attributes_1, attributes_2


# again: african american names vs. european american names and pleasant vs unpleasant but with different names
def weat_4():
    # excluded as in the original paper: Jay, Kristen, (here only excluded in the glove experiments)
    targets_1 = ["Brad", "Brendan", "Geoffrey", "Greg", "Brett", "Matthew", "Neil", "Todd", "Allison", "Anne",
                 "Carrie", "Emily", "Jill", "Laurie", "Meredith", "Sarah", "Jay", "Kristen"]

    # excluded in GloVe experiments:  Tremayne, Latonya,
    targets_2 = ["Darnell", "Hakim", "Jermaine", "Kareem", "Jamal", "Leroy", "Rasheed", "Tyrone", "Aisha", "Ebony",
                 "Keisha", "Kenya", "Lakisha", "Latoya", "Tamika", "Tanisha", "Tremayne", "Latonya"]

    attributes_1 = ["caress", "freedom", "health", "love", "peace", "cheer", "friend", "heaven", "loyal",
                    "pleasure",
                    "diamond", "gentle", "honest", "lucky", "rainbow", "diploma", "gift", "honor", "miracle",
                    "sunrise",
                    "family", "happy", "laughter", "paradise", "vacation"]

    # same as before but agony and prison are replaced with bomb and evil
    attributes_2 = ["abuse", "crash", "filth", "murder", "sickness", "accident", "death", "grief", "poison",
                    "stink",
                    "assault", "disaster", "hatred", "pollute", "tragedy", "divorce", "jail", "poverty", "ugly",
                    "cancer",
                    "kill", "rotten",
                    "vomit", "bomb", "evil"]

    return targets_1, targets_2, attributes_1, attributes_2


# again african american vs european american names, but with different attributes
def weat_5():
    # excluded as in the original paper: Jay, Kristen, (here only excluded in the glove experiments)
    targets_1 = ["Brad", "Brendan", "Geoffrey", "Greg", "Brett", "Matthew", "Neil", "Todd", "Allison", "Anne",
                 "Carrie", "Emily", "Jill", "Laurie", "Meredith", "Sarah", "Jay", "Kristen"]

    # excluded in GloVe experiments:  Tremayne, Latonya,
    targets_2 = ["Darnell", "Hakim", "Jermaine", "Kareem", "Jamal", "Leroy", "Rasheed", "Tyrone", "Aisha", "Ebony",
                 "Keisha", "Kenya", "Lakisha", "Latoya", "Tamika", "Tanisha", "Tremayne", "Latonya"]

    attributes_1 = ["joy", "love", "peace", "wonderful", "pleasure", "friend", "laughter", "happy"]

    attributes_2 = ["agony", "terrible", "horrible", "nasty", "evil", "war", "awful", "failure"]

    return targets_1, targets_2, attributes_1, attributes_2


# sex- career
def weat_6():
    targets_1 = ["John", "Paul", "Mike", "Kevin", "Steve", "Greg", "Jeff", "Bill"]
    targets_2 = ["Amy", "Joan", "Lisa", "Sarah", "Diana", "Kate", "Ann", "Donna"]
    attributes_1 = ["executive", "management", "professional", "corporation", "salary", "office", "business",
                    "career"]
    attributes_2 = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]

    return targets_1, targets_2, attributes_1, attributes_2


# sex - math
def weat_7():
    targets_1 = ["math", "algebra", "geometry", "calculus", "equations", "computation", "numbers", "addition"]
    targets_2 = ["poetry", "art", "dance", "literature", "novel", "symphony", "drama", "sculpture"]
    attributes_1 = ["male", "man", "boy", "brother", "he", "him", "his", "son"]
    attributes_2 = ["female", "woman", "girl", "sister", "she", "her", "hers", "daughter"]
    return targets_1, targets_2, attributes_1, attributes_2


# sex - science
def weat_8():
    targets_1 = ["science", "technology", "physics", "chemistry", "Einstein", "NASA", "experiment", "astronomy"]
    targets_2 = ["poetry", "art", "Shakespeare", "dance", "literature", "novel", "symphony", "drama"]
    attributes_1 = ["brother", "father", "uncle", "grandfather", "son", "he", "his", "him"]
    attributes_2 = ["sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]
    return targets_1, targets_2, attributes_1, attributes_2


# desease - uncontrollability
def weat_9():
    targets_1 = ["sad", "hopeless", "gloomy", "tearful", "miserable", "depressed"]
    targets_2 = ["sick", "illness", "influenza", "disease", "virus", "cancer"]
    # in w2v experiments, the've replaced short-term with short
    attributes_1 = ["impermanent", "unstable", "variable", "fleeting", "short-term", "brief", "occasional"]
    attributes_2 = ["stable", "always", "constant", "persistent", "chronic", "prolonged", "forever"]

    return targets_1, targets_2, attributes_1, attributes_2


# old - pleasant
def weat_10():
    targets_1 = ["Tiffany", "Michelle", "Cindy", "Kristy", "Brad", "Eric", "Joey", "Billy"]
    targets_2 = ["Ethel", "Bernice", "Gertrude", "Agnes", "Cecil", "Wilbert", "Mortimer", "Edgar"]
    attributes_1 = ["joy", "love", "peace", "wonderful", "pleasure", "friend", "laughter", "happy"]
    attributes_2 = ["agony", "terrible", "horrible", "nasty", "evil", "war", "awful", "failure"]

    return targets_1, targets_2, attributes_1, attributes_2

def load_vocab_goran(path):  #load pickle files
    return pickle.load(open(path, "rb"))
def translate(translation_dict, terms):
    translation = []
    for t in terms:
        if t in translation_dict or t.lower() in translation_dict:
            if t.lower() in translation_dict:
                male, female = translation_dict[t.lower()]
            elif t in translation_dict:
                male, female = translation_dict[t]
            if female is None or female is '':
                translation.append(male)
            else:
                translation.append(male)
                translation.append(female)
        else:
            translation.append(t)
    translation = list(set(translation))
    return translation

def ommit(opened_file,  targets_1, targets_2, attributes_1, attributes_2,test_number ):
    opened_file.writelines("Test Number: " + str(test_number) + "\n")
    opened_file.write("TI"+ "\t")
    for i in targets_1:
        opened_file.write(i +"\t")
    opened_file.write("\n")
    opened_file.write("T2" + "\t")
    for i in targets_2:
        opened_file.write(i + "\t")
    opened_file.write("\n")
    opened_file.write("A1" + "\t")
    for i in attributes_1:
        opened_file.write(i + "\t")
    opened_file.write("\n")
    opened_file.write("A2" + "\t")
    for i in attributes_2:
        opened_file.write(i + "\t")
    opened_file.write("\n")

for test_number in range(1,11):
    if test_number == 1:
        targets_1, targets_2, attributes_1, attributes_2 = weat_1()
    elif test_number == 2:
        targets_1, targets_2, attributes_1, attributes_2 = weat_2()
    elif test_number == 3:
        targets_1, targets_2, attributes_1, attributes_2 = weat_3()
    elif test_number == 4:
        targets_1, targets_2, attributes_1, attributes_2 = weat_4()
    elif test_number == 5:
        targets_1, targets_2, attributes_1, attributes_2 = weat_5()
    elif test_number == 6:
        targets_1, targets_2, attributes_1, attributes_2 = weat_6()
    elif test_number == 7:
        targets_1, targets_2, attributes_1, attributes_2 = weat_7()
    elif test_number == 8:
        targets_1, targets_2, attributes_1, attributes_2 = weat_8()
    elif test_number == 9:
        targets_1, targets_2, attributes_1, attributes_2 = weat_9()
    elif test_number == 10:
        targets_1, targets_2, attributes_1, attributes_2 = weat_10()
    else:
        raise ValueError("Only WEAT 1 to 10 are supported")

    translation_dict = load_vocab_goran("data/vocab_dict_en_arNoClean.p")
    targets_1 = translate(translation_dict, targets_1)
    targets_2 = translate(translation_dict, targets_2)
    attributes_1 = translate(translation_dict, attributes_1)
    attributes_2 = translate(translation_dict, attributes_2)
    ommit(output_file,  targets_1, targets_2, attributes_1, attributes_2, test_number)

output_file.close()