Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

doc: escaping quotes requires quoted queries (fix #185) #189

Merged
merged 3 commits into from
Jan 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,28 @@ best_doc = searcher.doc(best_doc_address)
Note: for integer search, the integer field should be indexed.

For more possible query formats and possible query options, see [Tantivy Query Parser Docs.](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html)

## Escape quotes inside a query string

The tantivy docs for the query parser say that special characters like quotes can be
escaped inside query values. However, it is also necessary to surround
the search query with additional quotes, as if a phrase query were being used.

The following will NOT work:

```python
# Raises ValueError
index.parse_query(r'sea\"', ["title", "body"])
```

However, the following will succeed:

```python
# Works!
index.parse_query(r'"sea\""', ["title", "body"])
```

Note that whether the included (and escaped) quote actually gets used
to match documents depends on the tokenizer used for the field. For example,
the default tokenizer will not match the document "sea\"s" with the query
"sea\"", because this tokenizer discards punctuation.
191 changes: 191 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
import pytest

from tantivy import SchemaBuilder, Index, Document


def schema():
    """Build the default test schema: a stored ``title`` and a ``body`` text field."""
    builder = SchemaBuilder()
    # Each add_* call returns a builder; rebinding is equivalent to chaining.
    builder = builder.add_text_field("title", stored=True)
    builder = builder.add_text_field("body")
    return builder.build()


def schema_numeric_fields():
    """Build a schema with integer, float and boolean fields plus a text body.

    Every field is stored; the three non-text fields are also indexed.
    """
    builder = SchemaBuilder()
    # Each add_* call returns a builder; rebinding is equivalent to chaining.
    builder = builder.add_integer_field("id", stored=True, indexed=True)
    builder = builder.add_float_field("rating", stored=True, indexed=True)
    builder = builder.add_boolean_field("is_good", stored=True, indexed=True)
    builder = builder.add_text_field("body", stored=True)
    return builder.build()


def create_index(dir=None):
    """Create an index over ``schema()`` and fill it with three sample documents.

    The documents are added in three different ways on purpose (field-by-field
    ``Document``, ``Document.from_dict`` and ``writer.add_json``) so that each
    ingestion path is exercised.

    Args:
        dir: Forwarded unchanged as the second argument of ``Index``.

    Returns:
        The committed and reloaded ``Index``.
    """
    # For now every test is assumed to share these documents; individual
    # tests may still build their own function-local indexes.
    idx = Index(schema(), dir)
    doc_writer = idx.writer(15_000_000, 1)

    # Way 1: create an empty Document and add field-value pairs one by one.
    old_man = Document()
    old_man.add_text("title", "The Old Man and the Sea")
    # NOTE(review): the first two fragments join as "inthe" (no space).
    # Kept verbatim because other tests may depend on the exact text.
    old_man_body = (
        "He was an old man who fished alone in a skiff in"
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    )
    old_man.add_text("body", old_man_body)
    doc_writer.add_document(old_man)

    # Way 2: build the Document from a dict whose keys match the field names.
    mice_and_men_body = (
        "A few miles south of Soledad, the Salinas River drops "
        "in close to the hillside bank and runs deep and "
        "green. The water is warm too, for it has slipped "
        "twinkling over the yellow sands in the sunlight "
        "before reaching the narrow pool. On one side of the "
        "river the golden foothill slopes curve up to the "
        "strong and rocky Gabilan Mountains, but on the valley "
        "side the water is lined with trees—willows fresh and "
        "green with every spring, carrying in their lower leaf "
        "junctures the debris of the winter’s flooding; and "
        "sycamores with mottled, white, recumbent limbs and "
        "branches that arch over the pool"
    )
    mice_and_men = Document.from_dict(
        {"title": "Of Mice and Men", "body": mice_and_men_body}
    )
    doc_writer.add_document(mice_and_men)

    # Way 3: hand raw JSON straight to the writer ("title" is multi-valued).
    frankenstein_json = """{
"title": ["Frankenstein", "The Modern Prometheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
}"""
    doc_writer.add_json(frankenstein_json)

    doc_writer.commit()
    idx.reload()
    return idx


def create_index_with_numeric_fields(dir=None):
    """Create an index over ``schema_numeric_fields()`` with two sample documents.

    Args:
        dir: Forwarded unchanged as the second argument of ``Index``.

    Returns:
        The committed and reloaded ``Index``.
    """
    idx = Index(schema_numeric_fields(), dir)
    doc_writer = idx.writer(15_000_000, 1)

    # First document: populated field by field through the typed add_* methods.
    first = Document()
    first.add_integer("id", 1)
    first.add_float("rating", 3.5)
    first.add_boolean("is_good", True)
    # NOTE(review): the first two fragments join as "inthe" (no space).
    # Kept verbatim because other tests may depend on the exact text.
    first_body = (
        "He was an old man who fished alone in a skiff in"
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    )
    first.add_text("body", first_body)
    doc_writer.add_document(first)

    # Second document: built in one go from a dict keyed by field name.
    second_body = (
        "A few miles south of Soledad, the Salinas River drops "
        "in close to the hillside bank and runs deep and "
        "green. The water is warm too, for it has slipped "
        "twinkling over the yellow sands in the sunlight "
        "before reaching the narrow pool. On one side of the "
        "river the golden foothill slopes curve up to the "
        "strong and rocky Gabilan Mountains, but on the valley "
        "side the water is lined with trees—willows fresh and "
        "green with every spring, carrying in their lower leaf "
        "junctures the debris of the winter’s flooding; and "
        "sycamores with mottled, white, recumbent limbs and "
        "branches that arch over the pool"
    )
    second_fields = {
        "id": 2,
        "rating": 4.5,
        "is_good": False,
        "body": second_body,
    }
    second = Document.from_dict(second_fields)
    doc_writer.add_document(second)

    doc_writer.commit()
    idx.reload()
    return idx


def spanish_schema():
    """Build the title/body text schema with both fields using the ``es_stem`` tokenizer."""
    builder = SchemaBuilder()
    # Each add_* call returns a builder; rebinding is equivalent to chaining.
    builder = builder.add_text_field("title", stored=True, tokenizer_name="es_stem")
    builder = builder.add_text_field("body", tokenizer_name="es_stem")
    return builder.build()


def create_spanish_index():
    """Create an index over ``spanish_schema()`` holding three Spanish documents.

    Mirrors ``create_index``: the documents are added field-by-field, via
    ``Document.from_dict`` and via ``writer.add_json`` respectively.

    Returns:
        The committed and reloaded ``Index``.
    """
    # For now every test is assumed to share these documents; individual
    # tests may still build their own function-local indexes.
    idx = Index(spanish_schema(), None)
    doc_writer = idx.writer()

    # Way 1: create an empty Document and add field-value pairs one by one.
    viejo = Document()
    viejo.add_text("title", "El viejo y el mar")
    viejo_body = (
        "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
    )
    viejo.add_text("body", viejo_body)
    doc_writer.add_document(viejo)

    # Way 2: build the Document from a dict whose keys match the field names.
    ratones_body = (
        "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque"
    )
    ratones = Document.from_dict(
        {"title": "De ratones y hombres", "body": ratones_body}
    )
    doc_writer.add_document(ratones)

    # Way 3: hand raw JSON straight to the writer ("title" is multi-valued).
    frankenstein_json = """{
"title": ["Frankenstein", "El moderno Prometeo"],
"body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
}"""
    doc_writer.add_json(frankenstein_json)

    doc_writer.commit()
    idx.reload()
    return idx


@pytest.fixture()
def dir_index(tmpdir):
    """Yield-free fixture returning ``(tmpdir, index)`` for an index created under ``tmpdir``."""
    # create_index receives the directory as a plain string path.
    index = create_index(str(tmpdir))
    return tmpdir, index


@pytest.fixture(scope="class")
def ram_index():
    """Class-scoped fixture: the sample index built by ``create_index()`` with no directory."""
    index = create_index()
    return index


@pytest.fixture(scope="class")
def ram_index_numeric_fields():
    """Class-scoped fixture: the numeric-fields sample index, built with no directory."""
    index = create_index_with_numeric_fields()
    return index


@pytest.fixture(scope="class")
def spanish_index():
    """Class-scoped fixture: the Spanish sample index from ``create_spanish_index()``."""
    index = create_spanish_index()
    return index
Loading
Loading