From b479c76a91119363e7baeb01b5a13187a01ea222 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Mon, 13 Mar 2017 12:59:19 -0400 Subject: [PATCH] renaming nearest_to to closest_to for back-compatibility --- NAMESPACE | 1 + NEWS.md | 13 +++-- R/matrixFunctions.R | 72 ++++++++++++++++----------- README.md | 10 ++-- inst/doc/exploration.R | 26 +++++----- inst/doc/exploration.Rmd | 26 +++++----- inst/doc/exploration.html | 26 +++++----- inst/doc/introduction.R | 8 +-- inst/doc/introduction.Rmd | 8 +-- inst/doc/introduction.html | 8 +-- man/closest_to.Rd | 61 +++++++++++++++++++++++ man/cosineSimilarity.Rd | 4 +- man/distend.Rd | 4 +- man/improve_vectorspace.Rd | 4 +- man/nearest_to.Rd | 56 ++++----------------- man/reject.Rd | 4 +- tests/testthat/test-name-collapsing.r | 26 +++++----- vignettes/exploration.Rmd | 26 +++++----- vignettes/introduction.Rmd | 8 +-- 19 files changed, 218 insertions(+), 173 deletions(-) create mode 100644 man/closest_to.Rd diff --git a/NAMESPACE b/NAMESPACE index a93500e..4b47db1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export("%>%") export(as.VectorSpaceModel) +export(closest_to) export(cosineDist) export(cosineSimilarity) export(distend) diff --git a/NEWS.md b/NEWS.md index 2e752b9..ac03212 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,12 +1,15 @@ # VERSION 2.0 -Upgrade focusing on ease of use and CRAN-ability. Bumping major version because of a breaking change in the behavior of `nearest_to`, which now returns a data.frame. +Upgrade focusing on ease of use and CRAN-ability. Bumping major version because of a breaking change in the behavior of `closest_to`, which now returns a data.frame. # Changes -## Change in nearest_to behavior. +## New default function: closest_to. -There's a change in `nearest_to` that will break some existing code. Now it returns a data.frame instead of a list. The data.frame columns have elaborate names so they can easily be manipulated with dplyr, and/or plotted with ggplot. 
There are flags to return to the old behavior (`as_df=FALSE`). +`nearest_to` was previously the easiest way to interact with cosine similarity functions. That's been deprecated +in favor of a new function, `closest_to`. (I would have changed the name but for back-compatibility reasons.) +The data.frame columns have elaborate names so they can easily be manipulated with dplyr, and/or plotted with ggplot. +`nearest_to` is now just a wrapped version of the new function. ## New syntax for vector addition. @@ -14,13 +17,13 @@ This package now allows formula scoping for the most common operations, and stri For instance, instead of writing ```R -vectors %>% nearest_to(vectors[rownames(vectors)=="king",] - vectors[rownames(vectors)=="man",] + vectors[rownames(vectors)=="woman",]) +vectors %>% closest_to(vectors[rownames(vectors)=="king",] - vectors[rownames(vectors)=="man",] + vectors[rownames(vectors)=="woman",]) ``` (whew!), you can write ```R -vectors %>% nearest_to(~"king" - "man" + "woman") +vectors %>% closest_to(~"king" - "man" + "woman") ``` diff --git a/R/matrixFunctions.R b/R/matrixFunctions.R index 8668bfd..faa6836 100644 --- a/R/matrixFunctions.R +++ b/R/matrixFunctions.R @@ -12,11 +12,11 @@ #' #' @examples #' -#' nearest_to(demo_vectors,"great") +#' closest_to(demo_vectors,"great") #' # stopwords like "and" and "very" are no longer top ten. #' # I don't know if this is really better, though. #' -#' nearest_to(improve_vectorspace(demo_vectors),"great") +#' closest_to(improve_vectorspace(demo_vectors),"great") #' improve_vectorspace = function(vectorspace,D=round(ncol(vectorspace)/100)) { mean = methods::new("VectorSpaceModel", @@ -531,9 +531,9 @@ filter_to_rownames <- function(matrix,words) { #' subjects = demo_vectors[[c("history","literature","biology","math","stats"),average=FALSE]] #' similarities = cosineSimilarity(subjects,subjects) #' -#' # Use 'nearest_to' to build up a large list of similar words to a seed set. 
+#' # Use 'closest_to' to build up a large list of similar words to a seed set. #' subjects = demo_vectors[[c("history","literature","biology","math","stats"),average=TRUE]] -#' new_subject_list = nearest_to(demo_vectors,subjects,20) +#' new_subject_list = closest_to(demo_vectors,subjects,20) #' new_subjects = demo_vectors[[new_subject_list$word,average=FALSE]] #' #' # Plot the cosineDistance of these as a dendrogram. @@ -637,10 +637,10 @@ project = function(matrix,vector) { #' See `project` for more details. #' #' @examples -#' nearest_to(demo_vectors,demo_vectors[["man"]]) +#' closest_to(demo_vectors,demo_vectors[["man"]]) #' #' genderless = reject(demo_vectors,demo_vectors[["he"]] - demo_vectors[["she"]]) -#' nearest_to(genderless,genderless[["man"]]) +#' closest_to(genderless,genderless[["man"]]) #' #' @export reject = function(matrix,vector) { @@ -673,12 +673,12 @@ reject = function(matrix,vector) { #' See `project` for more details and usage. #' #' @examples -#' nearest_to(demo_vectors,"sweet") +#' closest_to(demo_vectors,"sweet") #' #' # Stretch out the vectorspace 4x longer along the gender direction. #' more_sexist = distend(demo_vectors, ~ "man" + "he" - "she" -"woman", 4) #' -#' nearest_to(more_sexist,"sweet") +#' closest_to(more_sexist,"sweet") #' #' @export distend = function(matrix,vector, multiplier) { @@ -692,7 +692,6 @@ distend = function(matrix,vector, multiplier) { #' @param vector A vector (or a string or a formula coercable to a vector) #' of the same length as the VectorSpaceModel. See below. #' @param n The number of closest words to include. -#' @param as_df Return as a data.frame? If false, returns a named vector, for back-compatibility. #' @param fancy_names If true (the default) the data frame will have descriptive names like #' 'similarity to "king+queen-man"'; otherwise, just 'similarity.' The default can speed up #' interactive exploration. 
@@ -704,8 +703,8 @@ distend = function(matrix,vector, multiplier) { #' 'cosineSimilarity'; the listing of several words similar to a given vector. #' Unlike cosineSimilarity, it returns a data.frame object instead of a matrix. #' cosineSimilarity is more powerful, because it can compare two matrices to -#' each other; nearest_to can only take a vector or vectorlike object as its second argument. -#' But with (or without) the argument n=Inf, nearest_to is often better for +#' each other; closest_to can only take a vector or vectorlike object as its second argument. +#' But with (or without) the argument n=Inf, closest_to is often better for #' plugging directly into a plot. #' #' As with cosineSimilarity, the second argument can take several forms. If it's a vector or @@ -717,26 +716,26 @@ distend = function(matrix,vector, multiplier) { #' @examples #' #' # Synonyms and similar words -#' nearest_to(demo_vectors,demo_vectors[["good"]]) +#' closest_to(demo_vectors,demo_vectors[["good"]]) #' #' # If 'matrix' is a VectorSpaceModel object, #' # you can also just enter a string directly, and #' # it will be evaluated in the context of the passed matrix. #' -#' nearest_to(demo_vectors,"good") +#' closest_to(demo_vectors,"good") #' #' # You can also express more complicated formulas. #' -#' nearest_to(demo_vectors,"good") +#' closest_to(demo_vectors,"good") #' #' # Something close to the classic king:man::queen:woman; #' # What's the equivalent word for a female teacher that "guy" is for #' # a male one? 
#' -#' nearest_to(demo_vectors,~ "guy" - "man" + "woman") +#' closest_to(demo_vectors,~ "guy" - "man" + "woman") #' #' @export -nearest_to = function(matrix, vector, n=10, as_df = TRUE, fancy_names = TRUE) { +closest_to = function(matrix, vector, n=10, fancy_names = TRUE) { label = deparse(substitute(vector),width.cutoff=500) if (substr(label,1,1)=="~") {label = substr(label,2,500)} @@ -749,20 +748,37 @@ nearest_to = function(matrix, vector, n=10, as_df = TRUE, fancy_names = TRUE) { # For sorting. ords = order(-sims[,1]) - if (!as_df) { - structure( - 1-sims[ords[1:n]], # Convert from similarity to distance. - names=rownames(sims)[ords[1:n]]) + return_val = data.frame(rownames(sims)[ords[1:n]], sims[ords[1:n]],stringsAsFactors=FALSE) + if (fancy_names) { + names(return_val) = c("word", paste("similarity to", label)) } else { - return_val = data.frame(rownames(sims)[ords[1:n]], sims[ords[1:n]],stringsAsFactors=FALSE) - if (fancy_names) { - names(return_val) = c("word", paste("similarity to", label)) - } else { - names(return_val) = c("word","similarity") - } - rownames(return_val) = NULL - return_val + names(return_val) = c("word","similarity") } + rownames(return_val) = NULL + return_val } +#' Nearest vectors to a word +#' +#' @description This a wrapper around closest_to, included for back-compatibility. Use +#' closest_to for new applications. +#' @param ... See `closest_to` +#' +#' @return a names vector of cosine similarities. See 'nearest_to' for more details. +#' @export +#' +#' @examples +#' +#' # Recommended usage in 1.0: +#' nearest_to(demo_vectors, demo_vectors[["good"]]) +#' +#' # Recommended usage in 2.0: +#' demo_vectors %>% closest_to("good") +#' +nearest_to = function(...) 
{ + vals = closest_to(...,fancy_names = F) + returnable = 1 - vals$similarity + names(returnable) = vals$word + returnable +} diff --git a/README.md b/README.md index 3bdd619..d146275 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,8 @@ An R package for building and exploring word embedding models. This package does three major things to make it easier to work with word2vec and other vectorspace models of language. 1. [Trains word2vec models](#creating-text-vectors) using an extended Jian Li's word2vec code; reads and writes the binary word2vec format so that you can import pre-trained models such as Google's; and provides tools for reading only *part* of a model (rows or columns) so you can explore a model in memory-limited situations. -2. [Creates a new `VectorSpaceModel` class in R that gives a better syntax for exploring a word2vec or GloVe model than native matrix methods.](#vectorspacemodel-object) For example, instead of writing `model[rownames(model)=="king",]`, you can write `model[["king"]]`, and instead of writing `vectors %>% nearest_to(vectors[rownames(vectors)=="king",] - vectors[rownames(vectors)=="man",] + vectors[rownames(vectors)=="woman",])` (whew!), you can write -`vectors %>% nearest_to(~"king" - "man" + "woman")`. +2. [Creates a new `VectorSpaceModel` class in R that gives a better syntax for exploring a word2vec or GloVe model than native matrix methods.](#vectorspacemodel-object) For example, instead of writing `model[rownames(model)=="king",]`, you can write `model[["king"]]`, and instead of writing `vectors %>% closest_to(vectors[rownames(vectors)=="king",] - vectors[rownames(vectors)=="man",] + vectors[rownames(vectors)=="woman",])` (whew!), you can write +`vectors %>% closest_to(~"king" - "man" + "woman")`. 3. 
[Implements several basic matrix operations that are useful in exploring word embedding models including cosine similarity, nearest neighbor, and vector projection](#useful-matrix-operations) with some caching that makes them much faster than the simplest implementations. ### Quick start @@ -85,7 +85,7 @@ Each takes a `VectorSpaceModel` as its first argument. Sometimes, it's appropria * `cosineSimilarity(VSM_1,VSM_2)` calculates the cosine similarity of every vector in on vector space model to every vector in another. This is `n^2` complexity. With a vocabulary size of 20,000 or so, it can be reasonable to compare an entire set to itself; or you can compare a larger set to a smaller one to search for particular terms of interest. * `cosineDistance(VSM_1,VSM_2)` is the inverse of cosineSimilarity. It's not really a distance metric, but can be used as one for clustering and the like. - * `nearest_to(VSM,vector,n)` wraps a particularly common use case for `cosineSimilarity`, of finding the top `n` terms in a `VectorSpaceModel` closest to term m + * `closest_to(VSM,vector,n)` wraps a particularly common use case for `cosineSimilarity`, of finding the top `n` terms in a `VectorSpaceModel` closest to term m * `project(VSM,vector)` takes a `VectorSpaceModel` and returns the portion parallel to the vector `vector`. * `reject(VSM,vector)` is the inverse of `project`; it takes a `VectorSpaceModel` and returns the portion orthogonal to the vector `vector`. This makes it possible, for example, to collapse a vector space by removing certain distinctions of meaning. * `magnitudes` calculated the magnitude of each element in a VSM. This is useful in many operations. 
@@ -98,7 +98,7 @@ not_that_kind_of_bank = chronam_vectors[["bank"]] %>% reject(chronam_vectors[["cashier"]]) %>% reject(chronam_vectors[["depositors"]]) %>% reject(chronam_vectors[["check"]]) -chronam_vectors %>% nearest_to(not_that_kind_of_bank) +chronam_vectors %>% closest_to(not_that_kind_of_bank) ``` These functions also allow an additional layer of syntactic sugar when working with word vectors. @@ -106,7 +106,7 @@ These functions also allow an additional layer of syntactic sugar when working w Or even just as a formula, if you're working entirely with a single model, so you don't have to keep referring to words; instead, you can use a formula interface to reduce typing and increase clarity. ```{r} -vectors %>% nearest_to(~ "king" - "man" + "woman") +vectors %>% closest_to(~ "king" - "man" + "woman") ``` diff --git a/inst/doc/exploration.R b/inst/doc/exploration.R index d6acf4a..07b42b9 100644 --- a/inst/doc/exploration.R +++ b/inst/doc/exploration.R @@ -6,33 +6,33 @@ library(magrittr) demo_vectors[["good"]] ## ------------------------------------------------------------------------ -demo_vectors %>% nearest_to(demo_vectors[["good"]]) +demo_vectors %>% closest_to(demo_vectors[["good"]]) ## ------------------------------------------------------------------------ -demo_vectors %>% nearest_to("bad") +demo_vectors %>% closest_to("bad") ## ------------------------------------------------------------------------ -demo_vectors %>% nearest_to(~"good"+"bad") +demo_vectors %>% closest_to(~"good"+"bad") # The same thing could be written as: -# demo_vectors %>% nearest_to(demo_vectors[["good"]]+demo_vectors[["bad"]]) +# demo_vectors %>% closest_to(demo_vectors[["good"]]+demo_vectors[["bad"]]) ## ------------------------------------------------------------------------ -demo_vectors %>% nearest_to(~"good" - "bad") +demo_vectors %>% closest_to(~"good" - "bad") ## ------------------------------------------------------------------------ -demo_vectors %>% nearest_to(~ "bad" - 
"good") +demo_vectors %>% closest_to(~ "bad" - "good") ## ------------------------------------------------------------------------ -demo_vectors %>% nearest_to(~ "he" - "she") -demo_vectors %>% nearest_to(~ "she" - "he") +demo_vectors %>% closest_to(~ "he" - "she") +demo_vectors %>% closest_to(~ "she" - "he") ## ------------------------------------------------------------------------ -demo_vectors %>% nearest_to(~ "guy" - "he" + "she") +demo_vectors %>% closest_to(~ "guy" - "he" + "she") ## ------------------------------------------------------------------------ -demo_vectors %>% nearest_to(~ "guy" + ("she" - "he")) +demo_vectors %>% closest_to(~ "guy" + ("she" - "he")) ## ------------------------------------------------------------------------ @@ -42,13 +42,13 @@ demo_vectors[[c("lady","woman","man","he","she","guy","man"), average=F]] %>% ## ------------------------------------------------------------------------ top_evaluative_words = demo_vectors %>% - nearest_to(~ "good"+"bad",n=75) + closest_to(~ "good"+"bad",n=75) goodness = demo_vectors %>% - nearest_to(~ "good"-"bad",n=Inf) + closest_to(~ "good"-"bad",n=Inf) femininity = demo_vectors %>% - nearest_to(~ "she" - "he", n=Inf) + closest_to(~ "she" - "he", n=Inf) ## ------------------------------------------------------------------------ library(ggplot2) diff --git a/inst/doc/exploration.Rmd b/inst/doc/exploration.Rmd index 663f2ae..fd056b8 100644 --- a/inst/doc/exploration.Rmd +++ b/inst/doc/exploration.Rmd @@ -48,7 +48,7 @@ demo_vectors[["good"]] These numbers are meaningless on their own. But in the vector space, we can find similar words. ```{r} -demo_vectors %>% nearest_to(demo_vectors[["good"]]) +demo_vectors %>% closest_to(demo_vectors[["good"]]) ``` The `%>%` is the pipe operator from magrittr; it helps to keep things organized, and is particularly useful with some of the things we'll see later. 
The 'similarity' scores here are cosine similarity in a vector space; 1.0 represents perfect similarity, 0 is no correlation, and -1.0 is complete opposition. In practice, vector "opposition" is different from the colloquial use of "opposite," and very rare. You'll only occasionally see vector scores below 0--as you can see above, "bad" is actually one of the most similar words to "good." @@ -56,7 +56,7 @@ The `%>%` is the pipe operator from magrittr; it helps to keep things organized, When interactively exploring a single model (rather than comparing *two* models), it can be a pain to keep retyping words over and over. Rather than operate on the vectors, this package also lets you access the word directly by using R's formula notation: putting a tilde in front of it. For a single word, you can even access it directly, as so. ```{r} -demo_vectors %>% nearest_to("bad") +demo_vectors %>% closest_to("bad") ``` ## Vector math @@ -65,16 +65,16 @@ The tildes are necessary syntax where things get interesting--you can do **math* ```{r} -demo_vectors %>% nearest_to(~"good"+"bad") +demo_vectors %>% closest_to(~"good"+"bad") # The same thing could be written as: -# demo_vectors %>% nearest_to(demo_vectors[["good"]]+demo_vectors[["bad"]]) +# demo_vectors %>% closest_to(demo_vectors[["good"]]+demo_vectors[["bad"]]) ``` Those are words that are common to both "good" and "bad". We could also find words that are shaded towards just good but *not* bad by using subtraction. ```{r} -demo_vectors %>% nearest_to(~"good" - "bad") +demo_vectors %>% closest_to(~"good" - "bad") ``` > What does this "subtraction" vector mean? @@ -88,14 +88,14 @@ demo_vectors %>% nearest_to(~"good" - "bad") Again, you can easily switch the order to the opposite: here are a bunch of bad words: ```{r} -demo_vectors %>% nearest_to(~ "bad" - "good") +demo_vectors %>% closest_to(~ "bad" - "good") ``` All sorts of binaries are captured in word2vec models. 
One of the most famous, since Mikolov's original word2vec paper, is *gender*. If you ask for similarity to "he"-"she", for example, you get words that appear mostly in a *male* context. Since these examples are from teaching evaluations, after just a few straightforwardly gendered words, we start to get things that only men are ("arrogant") or where there are very few women in the university ("physics") ```{r} -demo_vectors %>% nearest_to(~ "he" - "she") -demo_vectors %>% nearest_to(~ "she" - "he") +demo_vectors %>% closest_to(~ "he" - "she") +demo_vectors %>% closest_to(~ "she" - "he") ``` ## Analogies @@ -112,7 +112,7 @@ removing its similarity to "he", and additing a similarity to "she". This yields the answer: the most similar term to "guy" for a woman is "lady." ```{r} -demo_vectors %>% nearest_to(~ "guy" - "he" + "she") +demo_vectors %>% closest_to(~ "guy" - "he" + "she") ``` If you're using the other mental framework, of thinking of these as real vectors, @@ -122,7 +122,7 @@ to femininity. You can then add this vector to "guy", and that will take you to only the grouping is different. ```{r} -demo_vectors %>% nearest_to(~ "guy" + ("she" - "he")) +demo_vectors %>% closest_to(~ "guy" + ("she" - "he")) ``` Principal components can let you plot a subset of these vectors to see how they relate. You can imagine an arrow from "he" to "she", from "guy" to "lady", and from "man" to "woman"; all run in roughly the same direction. @@ -140,13 +140,13 @@ First we build up three data_frames: first, a list of the 50 top evaluative word ```{r} top_evaluative_words = demo_vectors %>% - nearest_to(~ "good"+"bad",n=75) + closest_to(~ "good"+"bad",n=75) goodness = demo_vectors %>% - nearest_to(~ "good"-"bad",n=Inf) + closest_to(~ "good"-"bad",n=Inf) femininity = demo_vectors %>% - nearest_to(~ "she" - "he", n=Inf) + closest_to(~ "she" - "he", n=Inf) ``` Then we can use tidyverse packages to join and plot these. 
diff --git a/inst/doc/exploration.html b/inst/doc/exploration.html index d0a29d9..eda1e1d 100644 --- a/inst/doc/exploration.html +++ b/inst/doc/exploration.html @@ -102,7 +102,7 @@

Getting started.

## attr(,".cache") ## <environment: 0xae47980>

These numbers are meaningless on their own. But in the vector space, we can find similar words.

-
demo_vectors %>% nearest_to(demo_vectors[["good"]])
+
demo_vectors %>% closest_to(demo_vectors[["good"]])
##         word similarity to demo_vectors[["good"]]
 ## 1       good                            1.0000000
 ## 2      great                            0.7089031
@@ -116,7 +116,7 @@ 

Getting started.

## 10 a 0.4893531

The %>% is the pipe operator from magrittr; it helps to keep things organized, and is particularly useful with some of the things we’ll see later. The ‘similarity’ scores here are cosine similarity in a vector space; 1.0 represents perfect similarity, 0 is no correlation, and -1.0 is complete opposition. In practice, vector “opposition” is different from the colloquial use of “opposite,” and very rare. You’ll only occasionally see vector scores below 0–as you can see above, “bad” is actually one of the most similar words to “good.”

When interactively exploring a single model (rather than comparing two models), it can be a pain to keep retyping words over and over. Rather than operate on the vectors, this package also lets you access the word directly by using R’s formula notation: putting a tilde in front of it. For a single word, you can even access it directly, like so.

-
demo_vectors %>% nearest_to("bad")
+
demo_vectors %>% closest_to("bad")
##         word similarity to "bad"
 ## 1        bad           1.0000000
 ## 2       good           0.5263100
@@ -132,7 +132,7 @@ 

Getting started.

Vector math

The tildes are necessary syntax where things get interesting–you can do math on these vectors. So if we want to find the words that are closest to the combination of “good” and “bad” (which is to say, words that get used in evaluation) we can write (see where the tilde is?):

-
demo_vectors %>% nearest_to(~"good"+"bad")
+
demo_vectors %>% closest_to(~"good"+"bad")
##      word similarity to "good" + "bad"
 ## 1     bad                    0.8845830
 ## 2    good                    0.8621269
@@ -145,9 +145,9 @@ 

Vector math

## 9 ok 0.4751181 ## 10 that 0.4692515
# The same thing could be written as:
-# demo_vectors %>% nearest_to(demo_vectors[["good"]]+demo_vectors[["bad"]])
+# demo_vectors %>% closest_to(demo_vectors[["good"]]+demo_vectors[["bad"]])

Those are words that are common to both “good” and “bad”. We could also find words that are shaded towards just good but not bad by using subtraction.

-
demo_vectors %>% nearest_to(~"good" - "bad")
+
demo_vectors %>% closest_to(~"good" - "bad")
##         word similarity to "good" - "bad"
 ## 1       good                    0.4205466
 ## 2      great                    0.3328308
@@ -163,7 +163,7 @@ 

Vector math

What does this “subtraction” vector mean? In practice, the easiest way to think of it is probably simply as ‘similar to “good” and dissimilar to “bad”’. Omer and Levy’s papers suggest this interpretation. But taking the vectors more seriously means you can think of it geometrically: “good”-“bad” is a vector that describes the difference between positive and negative. Similarity to this vector means, technically, the portion of a word’s vector whose multidimensional path lies largely along the direction between the two words.

Again, you can easily switch the order to the opposite: here are a bunch of bad words:

-
demo_vectors %>% nearest_to(~ "bad" - "good")
+
demo_vectors %>% closest_to(~ "bad" - "good")
##        word similarity to "bad" - "good"
 ## 1       bad                    0.5501080
 ## 2    either                    0.2372618
@@ -176,7 +176,7 @@ 

Vector math

## 9 dumb 0.1455157 ## 10 unfair 0.1449083

All sorts of binaries are captured in word2vec models. One of the most famous, since Mikolov’s original word2vec paper, is gender. If you ask for similarity to “he”-“she”, for example, you get words that appear mostly in a male context. Since these examples are from teaching evaluations, after just a few straightforwardly gendered words, we start to get things that only men are (“arrogant”) or where there are very few women in the university (“physics”)

-
demo_vectors %>% nearest_to(~ "he" - "she")
+
demo_vectors %>% closest_to(~ "he" - "she")
##        word similarity to "he" - "she"
 ## 1        he                  0.5014923
 ## 2       his                  0.4467857
@@ -188,7 +188,7 @@ 

Vector math

## 8 himself 0.3436856 ## 9 arrogant 0.1662236 ## 10 physics 0.1560129
-
demo_vectors %>% nearest_to(~ "she" - "he")
+
demo_vectors %>% closest_to(~ "she" - "he")
##       word similarity to "she" - "he"
 ## 1      she                  0.5749598
 ## 2      her                  0.5707957
@@ -206,7 +206,7 @@ 

Analogies

We can expand out the match to perform analogies. Men tend to be called ‘guys’. What’s the female equivalent? In an SAT-style analogy, you might write he:guy::she:???. In vector math, we think of this as moving between points.

If you’re using the mental framework of positive as ‘similarity’ and negative as ‘dissimilarity,’ you can think of this as starting at “guy”, removing its similarity to “he”, and adding a similarity to “she”.

This yields the answer: the most similar term to “guy” for a woman is “lady.”

-
demo_vectors %>% nearest_to(~ "guy" - "he" + "she")
+
demo_vectors %>% closest_to(~ "guy" - "he" + "she")
##       word similarity to "guy" - "he" + "she"
 ## 1     lady                          0.8851965
 ## 2    woman                          0.7777516
@@ -219,7 +219,7 @@ 

Analogies

## 9 herself 0.4589193 ## 10 mrs 0.4508955

If you’re using the other mental framework, of thinking of these as real vectors, you might phrase this in a slightly different way. You have a gender vector ("female" - "male") that represents the direction of masculinity to femininity. You can then add this vector to “guy”, and that will take you to a new neighborhood. You might phrase that this way: note that the math is exactly equivalent, and only the grouping is different.

-
demo_vectors %>% nearest_to(~ "guy" + ("she" - "he"))
+
demo_vectors %>% closest_to(~ "guy" + ("she" - "he"))
##       word similarity to "guy" + ("she" - "he")
 ## 1     lady                            0.8851965
 ## 2    woman                            0.7777516
@@ -238,13 +238,13 @@ 

Analogies

These lists of ten words at a time are useful for interactive exploration, but sometimes we might want to say ‘n=Inf’ to return the full list. For instance, we can combine these two methods to look at positive and negative words used to evaluate teachers.

First we build up three data_frames: first, a list of the 75 top evaluative words, and then complete lists of similarity to "good" - "bad" and "she" - "he".

top_evaluative_words = demo_vectors %>% 
-   nearest_to(~ "good"+"bad",n=75)
+   closest_to(~ "good"+"bad",n=75)
 
 goodness = demo_vectors %>% 
-  nearest_to(~ "good"-"bad",n=Inf) 
+  closest_to(~ "good"-"bad",n=Inf) 
 
 femininity = demo_vectors %>% 
-  nearest_to(~ "she" - "he", n=Inf)
+ closest_to(~ "she" - "he", n=Inf)

Then we can use tidyverse packages to join and plot these. An inner_join restricts us down to just those top 75 words, and ggplot can array the words on axes.

library(ggplot2)
 library(dplyr)
diff --git a/inst/doc/introduction.R b/inst/doc/introduction.R
index 2543e42..8c030f0 100644
--- a/inst/doc/introduction.R
+++ b/inst/doc/introduction.R
@@ -26,14 +26,14 @@ if (!file.exists("cookbook_vectors.bin")) {model = train_word2vec("cookbooks.txt
 
 
 ## ------------------------------------------------------------------------
-model %>% nearest_to("fish")
+model %>% closest_to("fish")
 
 ## ------------------------------------------------------------------------
 model %>% 
-  nearest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50)
+  closest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50)
 
 ## ------------------------------------------------------------------------
-some_fish = nearest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150)
+some_fish = closest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150)
 fishy = model[[some_fish$word,average=F]]
 plot(fishy,method="pca")
 
@@ -51,7 +51,7 @@ sapply(sample(1:centers,10),function(n) {
 ingredients = c("madeira","beef","saucepan","carrots")
 term_set = lapply(ingredients, 
        function(ingredient) {
-          nearest_words = model %>% nearest_to(model[[ingredient]],20)
+          nearest_words = model %>% closest_to(model[[ingredient]],20)
           nearest_words$word
         }) %>% unlist
 
diff --git a/inst/doc/introduction.Rmd b/inst/doc/introduction.Rmd
index a3ef10a..6f1185b 100644
--- a/inst/doc/introduction.Rmd
+++ b/inst/doc/introduction.Rmd
@@ -86,14 +86,14 @@ Now we have a model in memory, trained on about 10 million words from 77 cookboo
 Well, you can run some basic operations to find the nearest elements:
 
 ```{r}
-model %>% nearest_to("fish")
+model %>% closest_to("fish")
 ```
 
 With that list, you can expand out further to search for multiple words:
 
 ```{r}
 model %>% 
-  nearest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50)
+  closest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50)
 ```
 
 Now we have a pretty expansive list of potential fish-related words from old cookbooks. This can be useful for a few different things:
@@ -105,7 +105,7 @@ Now we have a pretty expansive list of potential fish-related words from old coo
 Or we can just arrange them somehow. In this case, it doesn't look like much of anything.
 
 ```{r}
-some_fish = nearest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150)
+some_fish = closest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150)
 fishy = model[[some_fish$word,average=F]]
 plot(fishy,method="pca")
 ```
@@ -137,7 +137,7 @@ the 20 words closest to each of four different kinds of words.
 ingredients = c("madeira","beef","saucepan","carrots")
 term_set = lapply(ingredients, 
        function(ingredient) {
-          nearest_words = model %>% nearest_to(model[[ingredient]],20)
+          nearest_words = model %>% closest_to(model[[ingredient]],20)
           nearest_words$word
         }) %>% unlist
 
diff --git a/inst/doc/introduction.html b/inst/doc/introduction.html
index 7119d2e..15510cf 100644
--- a/inst/doc/introduction.html
+++ b/inst/doc/introduction.html
@@ -457,7 +457,7 @@ 

Building test data

Similarity searches

Well, you can run some basic operations to find the nearest elements:

-
model %>% nearest_to("fish")
+
model %>% closest_to("fish")
##              word similarity to "fish"
 ## 1            fish            1.0000000
 ## 2          thames            0.6550232
@@ -471,7 +471,7 @@ 

Similarity searches

## 10 turbot 0.6302442

With that list, you can expand out further to search for multiple words:

model %>% 
-  nearest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50)
+ closest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50)
##                word
 ## 1             trout
 ## 2              carp
@@ -581,7 +581,7 @@ 

Similarity searches

  • As a source for visualization.
  • Or we can just arrange them somehow. In this case, it doesn’t look like much of anything.

    -
    some_fish = nearest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150)
    +
    some_fish = closest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150)
     fishy = model[[some_fish$word,average=F]]
     plot(fishy,method="pca")

    @@ -634,7 +634,7 @@

    Clustering

    ingredients = c("madeira","beef","saucepan","carrots")
     term_set = lapply(ingredients, 
            function(ingredient) {
    -          nearest_words = model %>% nearest_to(model[[ingredient]],20)
    +          nearest_words = model %>% closest_to(model[[ingredient]],20)
               nearest_words$word
             }) %>% unlist
     
    diff --git a/man/closest_to.Rd b/man/closest_to.Rd
    new file mode 100644
    index 0000000..3064393
    --- /dev/null
    +++ b/man/closest_to.Rd
    @@ -0,0 +1,61 @@
    +% Generated by roxygen2: do not edit by hand
    +% Please edit documentation in R/matrixFunctions.R
    +\name{closest_to}
    +\alias{closest_to}
    +\title{Return the n closest words in a VectorSpaceModel to a given vector.}
    +\usage{
    +closest_to(matrix, vector, n = 10, fancy_names = TRUE)
    +}
    +\arguments{
    +\item{matrix}{A matrix or VectorSpaceModel}
    +
    +\item{vector}{A vector (or a string or a formula coercible to a vector)
    +of the same length as the VectorSpaceModel. See below.}
    +
    +\item{n}{The number of closest words to include.}
    +
    +\item{fancy_names}{If true (the default) the data frame will have descriptive names like
    +'similarity to "king+queen-man"'; otherwise, just 'similarity.' The default can speed up
    + interactive exploration.}
    +}
    +\value{
    +A sorted data.frame with columns for the words and their similarity
    +to the target vector. (For the older named-vector return format, see 'nearest_to'.)
    +}
    +\description{
    +This is a convenience wrapper around the most common use of
    +'cosineSimilarity'; the listing of several words similar to a given vector.
    +Unlike cosineSimilarity, it returns a data.frame object instead of a matrix.
    +cosineSimilarity is more powerful, because it can compare two matrices to
    +each other; closest_to can only take a vector or vectorlike object as its second argument.
    +But with (or without) the argument n=Inf, closest_to is often better for
    +plugging directly into a plot.
    +
    +As with cosineSimilarity, the second argument can take several forms. If it's a vector or
    +matrix slice, it will be taken literally. If it's a character string, it will
    +be interpreted as a word and the associated vector from `matrix` will be used. If
    +a formula, any strings in the formula will be converted to rows in the associated `matrix`
    +before any math happens.
    +}
    +\examples{
    +
    +# Synonyms and similar words
    +closest_to(demo_vectors,demo_vectors[["good"]])
    +
    +# If 'matrix' is a VectorSpaceModel object,
    +# you can also just enter a string directly, and
    +# it will be evaluated in the context of the passed matrix.
    +
    +closest_to(demo_vectors,"good")
    +
    +# You can also express more complicated formulas.
    +
    +closest_to(demo_vectors,"good")
    +
    +# Something close to the classic king:man::queen:woman;
    +# What's the equivalent word for a female teacher that "guy" is for
    +# a male one?
    +
    +closest_to(demo_vectors,~ "guy" - "man" + "woman")
    +
    +}
    diff --git a/man/cosineSimilarity.Rd b/man/cosineSimilarity.Rd
    index 3aa0ecd..e74263b 100644
    --- a/man/cosineSimilarity.Rd
    +++ b/man/cosineSimilarity.Rd
    @@ -26,9 +26,9 @@ Calculate the cosine similarity of two matrices or a matrix and a vector.
     subjects = demo_vectors[[c("history","literature","biology","math","stats"),average=FALSE]]
     similarities = cosineSimilarity(subjects,subjects)
     
    -# Use 'nearest_to' to build up a large list of similar words to a seed set.
    +# Use 'closest_to' to build up a large list of similar words to a seed set.
     subjects = demo_vectors[[c("history","literature","biology","math","stats"),average=TRUE]]
    -new_subject_list = nearest_to(demo_vectors,subjects,20)
    +new_subject_list = closest_to(demo_vectors,subjects,20)
     new_subjects = demo_vectors[[new_subject_list$word,average=FALSE]]
     
     # Plot the cosineDistance of these as a dendrogram.
    diff --git a/man/distend.Rd b/man/distend.Rd
    index 3ab13d9..ef5356c 100644
    --- a/man/distend.Rd
    +++ b/man/distend.Rd
    @@ -31,11 +31,11 @@ eliminating a vector entirely. Values less than zero will do some type of mirror
     universe thing, but probably aren't useful?
     }
     \examples{
    -nearest_to(demo_vectors,"sweet")
    +closest_to(demo_vectors,"sweet")
     
     # Stretch out the vectorspace 4x longer along the gender direction.
     more_sexist = distend(demo_vectors, ~ "man" + "he" - "she" -"woman", 4)
     
    -nearest_to(more_sexist,"sweet")
    +closest_to(more_sexist,"sweet")
     
     }
    diff --git a/man/improve_vectorspace.Rd b/man/improve_vectorspace.Rd
    index adff851..02e626d 100644
    --- a/man/improve_vectorspace.Rd
    +++ b/man/improve_vectorspace.Rd
    @@ -19,11 +19,11 @@ See reference for a full description. Supposedly, these operations will improve
     }
     \examples{
     
    -nearest_to(demo_vectors,"great")
    +closest_to(demo_vectors,"great")
     # stopwords like "and" and "very" are no longer top ten.
     # I don't know if this is really better, though.
     
    -nearest_to(improve_vectorspace(demo_vectors),"great")
    +closest_to(improve_vectorspace(demo_vectors),"great")
     
     }
     \references{
    diff --git a/man/nearest_to.Rd b/man/nearest_to.Rd
    index 3399829..6964066 100644
    --- a/man/nearest_to.Rd
    +++ b/man/nearest_to.Rd
    @@ -2,62 +2,26 @@
     % Please edit documentation in R/matrixFunctions.R
     \name{nearest_to}
     \alias{nearest_to}
    -\title{Return the n closest words in a VectorSpaceModel to a given vector.}
    +\title{Nearest vectors to a word}
     \usage{
    -nearest_to(matrix, vector, n = 10, as_df = TRUE, fancy_names = TRUE)
    +nearest_to(...)
     }
     \arguments{
    -\item{matrix}{A matrix or VectorSpaceModel}
    -
    -\item{vector}{A vector (or a string or a formula coercable to a vector)
    -of the same length as the VectorSpaceModel. See below.}
    -
    -\item{n}{The number of closest words to include.}
    -
    -\item{as_df}{Return as a data.frame? If false, returns a named vector, for back-compatibility.}
    -
    -\item{fancy_names}{If true (the default) the data frame will have descriptive names like
    -'similarity to "king+queen-man"'; otherwise, just 'similarity.' The default can speed up
    - interactive exploration.}
    +\item{...}{See `closest_to`}
     }
     \value{
    -A sorted data.frame with columns for the words and their similarity
    -to the target vector. (Or, if as_df==FALSE, a named vector of similarities.)
    +a named vector of cosine similarities. See 'closest_to' for more details.
     }
     \description{
    -This is a convenience wrapper around the most common use of
    -'cosineSimilarity'; the listing of several words similar to a given vector.
    -Unlike cosineSimilarity, it returns a data.frame object instead of a matrix.
    -cosineSimilarity is more powerful, because it can compare two matrices to
    -each other; nearest_to can only take a vector or vectorlike object as its second argument.
    -But with (or without) the argument n=Inf, nearest_to is often better for
    -plugging directly into a plot.
    -
    -As with cosineSimilarity, the second argument can take several forms. If it's a vector or
    -matrix slice, it will be taken literally. If it's a character string, it will
    -be interpreted as a word and the associated vector from `matrix` will be used. If
    -a formula, any strings in the formula will be converted to rows in the associated `matrix`
    -before any math happens.
    +This is a wrapper around closest_to, included for back-compatibility. Use
    +closest_to for new applications.
     }
     \examples{
     
    -# Synonyms and similar words
    -nearest_to(demo_vectors,demo_vectors[["good"]])
    -
    -# If 'matrix' is a VectorSpaceModel object,
    -# you can also just enter a string directly, and
    -# it will be evaluated in the context of the passed matrix.
    -
    -nearest_to(demo_vectors,"good")
    -
    -# You can also express more complicated formulas.
    -
    -nearest_to(demo_vectors,"good")
    -
    -# Something close to the classic king:man::queen:woman;
    -# What's the equivalent word for a female teacher that "guy" is for
    -# a male one?
    +# Recommended usage in 1.0:
    +nearest_to(demo_vectors, demo_vectors[["good"]])
     
    -nearest_to(demo_vectors,~ "guy" - "man" + "woman")
    +# Recommended usage in 2.0:
    +demo_vectors \%>\% closest_to("good")
     
     }
    diff --git a/man/reject.Rd b/man/reject.Rd
    index 3ad191e..64f9d60 100644
    --- a/man/reject.Rd
    +++ b/man/reject.Rd
    @@ -25,9 +25,9 @@ See `project` for more details.
     Return a vector rejection for each element in a VectorSpaceModel
     }
     \examples{
    -nearest_to(demo_vectors,demo_vectors[["man"]])
    +closest_to(demo_vectors,demo_vectors[["man"]])
     
     genderless = reject(demo_vectors,demo_vectors[["he"]] - demo_vectors[["she"]])
    -nearest_to(genderless,genderless[["man"]])
    +closest_to(genderless,genderless[["man"]])
     
     }
    diff --git a/tests/testthat/test-name-collapsing.r b/tests/testthat/test-name-collapsing.r
    index 7e11d58..45de6aa 100644
    --- a/tests/testthat/test-name-collapsing.r
    +++ b/tests/testthat/test-name-collapsing.r
    @@ -2,25 +2,25 @@ context("Name collapsing")
     
     test_that("name substitution works",
       expect_equivalent(
    -    demo_vectors %>% nearest_to(~"good")
    +    demo_vectors %>% closest_to(~"good")
         ,
    -    demo_vectors %>% nearest_to(demo_vectors[["good"]])
    +    demo_vectors %>% closest_to(demo_vectors[["good"]])
       )
     )
     
     test_that("character substitution works",
               expect_equivalent(
    -            demo_vectors %>% nearest_to("good")
    +            demo_vectors %>% closest_to("good")
                 ,
    -            demo_vectors %>% nearest_to(demo_vectors[["good"]])
    +            demo_vectors %>% closest_to(demo_vectors[["good"]])
               )
     )
     
     test_that("addition works in substitutions",
               expect_equivalent(
    -            demo_vectors %>% nearest_to(~ "good" + "bad")
    +            demo_vectors %>% closest_to(~ "good" + "bad")
                 ,
    -            demo_vectors %>% nearest_to(demo_vectors[["good"]] + demo_vectors[["bad"]])
    +            demo_vectors %>% closest_to(demo_vectors[["good"]] + demo_vectors[["bad"]])
               )
     )
     
    @@ -32,16 +32,16 @@ test_that("addition provides correct results",
     
     test_that("single-argument negation works",
               expect_equivalent(
    -            demo_vectors %>% nearest_to(~ -("good"-"bad"))
    +            demo_vectors %>% closest_to(~ -("good"-"bad"))
                 ,
    -            demo_vectors %>% nearest_to(~ "bad"-"good")
    +            demo_vectors %>% closest_to(~ "bad"-"good")
     
               ))
     
    -test_that("nearest_to can wrap in function",
    +test_that("closest_to can wrap in function",
               expect_equal(
    -            {function(x) {nearest_to(x,~ "class" + "school")}}(demo_vectors),
    -            nearest_to(demo_vectors,~ "class" + "school")
    +            {function(x) {closest_to(x,~ "class" + "school")}}(demo_vectors),
    +            closest_to(demo_vectors,~ "class" + "school")
               )
     )
     
    @@ -54,7 +54,7 @@ test_that("Name substitution is occurring",
     test_that("reference in functional scope is passed along",
               expect_equivalent(
                 lapply(c("good"),function(referenced_word)
    -              {demo_vectors %>% nearest_to(demo_vectors[[referenced_word]])})[[1]],
    -            demo_vectors %>% nearest_to("good")
    +              {demo_vectors %>% closest_to(demo_vectors[[referenced_word]])})[[1]],
    +            demo_vectors %>% closest_to("good")
              )
     )
    diff --git a/vignettes/exploration.Rmd b/vignettes/exploration.Rmd
    index 663f2ae..fd056b8 100644
    --- a/vignettes/exploration.Rmd
    +++ b/vignettes/exploration.Rmd
    @@ -48,7 +48,7 @@ demo_vectors[["good"]]
     These numbers are meaningless on their own. But in the vector space, we can find similar words.
     
     ```{r}
    -demo_vectors %>% nearest_to(demo_vectors[["good"]])
    +demo_vectors %>% closest_to(demo_vectors[["good"]])
     ```
     
     The `%>%` is the pipe operator from magrittr; it helps to keep things organized, and is particularly useful with some of the things we'll see later. The 'similarity' scores here are cosine similarity in a vector space; 1.0 represents perfect similarity, 0 is no correlation, and -1.0 is complete opposition. In practice, vector "opposition" is different from the colloquial use of "opposite," and very rare. You'll only occasionally see vector scores below 0--as you can see above, "bad" is actually one of the most similar words to "good."
    @@ -56,7 +56,7 @@ The `%>%` is the pipe operator from magrittr; it helps to keep things organized,
     When interactively exploring a single model (rather than comparing *two* models), it can be a pain to keep retyping words over and over. Rather than operate on the vectors, this package also lets you access the word directly by using R's formula notation: putting a tilde in front of it. For a single word, you can even access it directly, as so.
     
     ```{r}
    -demo_vectors %>% nearest_to("bad")
    +demo_vectors %>% closest_to("bad")
     ```
     
     ## Vector math
    @@ -65,16 +65,16 @@ The tildes are necessary syntax where things get interesting--you can do **math*
     
     ```{r}
     
    -demo_vectors %>% nearest_to(~"good"+"bad")
    +demo_vectors %>% closest_to(~"good"+"bad")
     
     # The same thing could be written as:
    -# demo_vectors %>% nearest_to(demo_vectors[["good"]]+demo_vectors[["bad"]])
    +# demo_vectors %>% closest_to(demo_vectors[["good"]]+demo_vectors[["bad"]])
     ```
     
     Those are words that are common to both "good" and "bad". We could also find words that are shaded towards just good but *not* bad by using subtraction.
     
     ```{r}
    -demo_vectors %>% nearest_to(~"good" - "bad")
    +demo_vectors %>% closest_to(~"good" - "bad")
     ```
     
     > What does this "subtraction" vector mean? 
    @@ -88,14 +88,14 @@ demo_vectors %>% nearest_to(~"good" - "bad")
     Again, you can easily switch the order to the opposite: here are a bunch of bad words:
     
     ```{r}
    -demo_vectors %>% nearest_to(~ "bad" - "good")
    +demo_vectors %>% closest_to(~ "bad" - "good")
     ```
     
     All sorts of binaries are captured in word2vec models. One of the most famous, since Mikolov's original word2vec paper, is *gender*. If you ask for similarity to "he"-"she", for example, you get words that appear mostly in a *male* context. Since these examples are from teaching evaluations, after just a few straightforwardly gendered words, we start to get things that only men are ("arrogant") or where there are very few women in the university ("physics")
     
     ```{r}
    -demo_vectors %>% nearest_to(~ "he" - "she")
    -demo_vectors %>% nearest_to(~ "she" - "he")
    +demo_vectors %>% closest_to(~ "he" - "she")
    +demo_vectors %>% closest_to(~ "she" - "he")
     ```
     
     ## Analogies
    @@ -112,7 +112,7 @@ removing its similarity to "he", and additing a similarity to "she".
     This yields the answer: the most similar term to "guy" for a woman is "lady."
     
     ```{r}
    -demo_vectors %>% nearest_to(~ "guy" - "he" + "she")
    +demo_vectors %>% closest_to(~ "guy" - "he" + "she")
     ```
     
     If you're using the other mental framework, of thinking of these as real vectors, 
    @@ -122,7 +122,7 @@ to femininity. You can then add this vector to "guy", and that will take you to
     only the grouping is different.
     
     ```{r}
    -demo_vectors %>% nearest_to(~ "guy" + ("she" - "he"))
    +demo_vectors %>% closest_to(~ "guy" + ("she" - "he"))
     ```
     
     Principal components can let you plot a subset of these vectors to see how they relate. You can imagine an arrow from "he" to "she", from "guy" to "lady", and from "man" to "woman"; all run in roughly the same direction.
    @@ -140,13 +140,13 @@ First we build up three data_frames: first, a list of the 50 top evaluative word
     
     ```{r}
     top_evaluative_words = demo_vectors %>% 
    -   nearest_to(~ "good"+"bad",n=75)
    +   closest_to(~ "good"+"bad",n=75)
     
     goodness = demo_vectors %>% 
    -  nearest_to(~ "good"-"bad",n=Inf) 
    +  closest_to(~ "good"-"bad",n=Inf) 
     
     femininity = demo_vectors %>% 
    -  nearest_to(~ "she" - "he", n=Inf)
    +  closest_to(~ "she" - "he", n=Inf)
     ```
     
     Then we can use tidyverse packages to join and plot these.
    diff --git a/vignettes/introduction.Rmd b/vignettes/introduction.Rmd
    index a3ef10a..6f1185b 100644
    --- a/vignettes/introduction.Rmd
    +++ b/vignettes/introduction.Rmd
    @@ -86,14 +86,14 @@ Now we have a model in memory, trained on about 10 million words from 77 cookboo
     Well, you can run some basic operations to find the nearest elements:
     
     ```{r}
    -model %>% nearest_to("fish")
    +model %>% closest_to("fish")
     ```
     
     With that list, you can expand out further to search for multiple words:
     
     ```{r}
     model %>% 
    -  nearest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50)
    +  closest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50)
     ```
     
     Now we have a pretty expansive list of potential fish-related words from old cookbooks. This can be useful for a few different things:
    @@ -105,7 +105,7 @@ Now we have a pretty expansive list of potential fish-related words from old coo
     Or we can just arrange them somehow. In this case, it doesn't look like much of anything.
     
     ```{r}
    -some_fish = nearest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150)
    +some_fish = closest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150)
     fishy = model[[some_fish$word,average=F]]
     plot(fishy,method="pca")
     ```
    @@ -137,7 +137,7 @@ the 20 words closest to each of four different kinds of words.
     ingredients = c("madeira","beef","saucepan","carrots")
     term_set = lapply(ingredients, 
            function(ingredient) {
    -          nearest_words = model %>% nearest_to(model[[ingredient]],20)
    +          nearest_words = model %>% closest_to(model[[ingredient]],20)
               nearest_words$word
             }) %>% unlist