
Implementation Advanced Search #175


Closed · wants to merge 4 commits
2 changes: 1 addition & 1 deletion .flake8
@@ -1,6 +1,6 @@
[flake8]
max-line-length = 120
exclude = .venv,dist,docs,build,*.egg,.git,__pycache__
exclude = .venv,dist,docs,build,*.egg,.git,__pycache__,venv
count = true
show-source = true
statistics = true
5 changes: 3 additions & 2 deletions pycaching/cache.py
@@ -1270,7 +1270,7 @@ class Type(enum.Enum):
project_ape = "9"
geocaching_hq = groundspeak_hq = "3773"
gps_adventures_exhibit = gps_maze = "1304"
groundspeak_block_party = "4738"
geocaching_hq_block_party = groundspeak_block_party = "4738"
locationless = reverse = "12"
hq_celebration = "3774"

@@ -1323,7 +1323,8 @@ def from_string(cls, name):
"geocaching hq": cls.geocaching_hq,
"groundspeak hq": cls.geocaching_hq,
"gps adventures exhibit": cls.gps_adventures_exhibit,
"groundspeak block party": cls.groundspeak_block_party,
"groundspeak block party": cls.geocaching_hq_block_party,
"geocaching hq block party": cls.geocaching_hq_block_party,
"locationless (reverse)": cls.locationless,
"geocaching hq celebration": cls.hq_celebration,
"community celebration event": cls.community_celebration,
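With the new alias, the old and the new member names refer to the same enum value, and from_string accepts both spellings. A quick sketch of what that means for callers, assuming Type is the enum nested in Cache as the rest of the library uses it:

    from pycaching.cache import Cache

    # Both member names refer to the same enum value after this change.
    assert Cache.Type.geocaching_hq_block_party is Cache.Type.groundspeak_block_party

    # Both mapping keys resolve to that member in from_string.
    assert Cache.Type.from_string("groundspeak block party") is Cache.Type.geocaching_hq_block_party
    assert Cache.Type.from_string("geocaching hq block party") is Cache.Type.geocaching_hq_block_party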
293 changes: 200 additions & 93 deletions pycaching/geocaching.py
@@ -15,7 +15,7 @@
import requests
from bs4.element import Script

from pycaching.cache import Cache, Size
from pycaching.cache import Cache
from pycaching.errors import Error, LoginFailedException, NotLoggedInException, PMOnlyException, TooManyRequestsError
from pycaching.geo import Point, Rectangle
from pycaching.log import Log
@@ -239,115 +239,222 @@ def get_logged_user(self, login_page=None):
m = re.search(r'"username":\s*"(.*)"', js_content)
return m.group(1) if m else None

def search(self, point, limit=float("inf")):
def advanced_search(
self,
options,
sort_by: Union[str, SortOrder] = SortOrder.distance,
reverse=False,
limit=float("inf"),
format="cache",
):
"""
Use the advanced search on www.geocaching.com to search for caches.

The function returns a generator which yields objects whose type depends on the format argument.

For basic members the advanced search is very limited.

The advanced search does not perform any consistency checks on the search parameters.

Formats:
- raw: returns the raw response text from the request
- json: returns the parsed JSON response
- html: returns only the HtmlString field of the response
- soup: returns the HtmlString parsed as a :class:`bs4.BeautifulSoup` object
- dict: parses the HTML string and yields each result as a dict
- cache: parses the HTML string and yields each result as a :class:`.Cache` object

How the limit argument works: for the dict and cache formats it limits the number of yielded items exactly.
For the raw, json, html and soup formats each yielded item covers one page of up to 50 results, so the effective limit is rounded up to the next multiple of 50.

:param options: Search parameters passed to the search endpoint.
:param sort_by: Order the results by the given criterion.
:param reverse: Reverse the sort order of the results.
:param limit: Maximum number of results to generate.
:param format: Output format, one of "raw", "json", "html", "soup", "dict" or "cache".
:return: Generator of results in the requested format.
"""
"""Return a generator of caches around some point.

Search for caches around some point by loading search pages and parsing the data from these
pages. Yield :class:`.Cache` objects filled with data from the search page. You can provide a limit
as a convenient way to stop the generator after a certain number of caches.

:param .geo.Point point: Search center point.
:param int limit: Maximum number of caches to generate.

formats:
    raw: the raw response text from the server
    json: the response parsed as JSON
    html: the raw HTML string from the response
    soup: the HTML string parsed as a BeautifulSoup object

    dict: each result as a dict
    cache: each result as a Cache object

:param options: Search parameters passed to the search endpoint.
:param sort_by: Order caches by the given criterion.
:param reverse: Reverse the sort order of the results.
:param limit: A value between 1 and 1000, or None (unsupported).
:param format: Output format, one of "raw", "json", "html", "soup", "dict" or "cache".
:return: Generator of results in the requested format.
"""
logging.info("Searching at {}".format(point))
logging.info(f"Searching with {options}, sort={sort_by}, reverse={reverse}, limit={limit}, format={format}")

start_index = 0
while True:
# get one page
geocaches_table, whole_page = self._search_get_page(point, start_index)
rows = geocaches_table.find_all("tr")
params = options.copy()

# leave loop if there are no (more) results
if not rows:
return
if "startIndex" not in params:
params["startIndex"] = 0
if "selectAll" not in params:
params["selectAll"] = "false"

# prepare language-dependent mappings
if start_index == 0:
cache_sizes_filter_wrapper = whole_page.find("div", class_="cache-sizes-wrapper")
localized_size_mapping = {
# key = "Small" (localized), value = Size.small
label.find("span").text.strip(): Size.from_number(label.find("input").get("value"))
for label in cache_sizes_filter_wrapper.find_all("label")
}

# parse caches in result
for start_index, row in enumerate(rows, start_index):

limit -= 1 # handle limit
if limit < 0:
return

# parse raw data
cache_details = row.find("span", "cache-details").text.split("|")
wp = cache_details[1].strip()

# create and fill cache object
# values are sanitized and converted in Cache setters
c = Cache(self, wp)
c.type = cache_details[0]
c.name = row.find("span", "cache-name").text
badge = row.find("svg", class_="badge")
c.found = "found" in str(badge) if badge is not None else False
c.favorites = row.find(attrs={"data-column": "FavoritePoint"}).text
c.state = not (row.get("class") and "disabled" in row.get("class"))
c.pm_only = row.find("td", "pm-upsell") is not None

if c.pm_only:
# PM-only caches don't have other attributes filled in
yield c
if not isinstance(sort_by, SortOrder):
sort_by = SortOrder(sort_by)

params.update({"asc": not reverse, "sort": sort_by.value})

load_more = True
while load_more:
if limit <= 0:
break

logging.debug(f"Search parameters {params}")
res = self._request(self._urls["search_more"], params=params, expect="raw")
res_json = res.json()

if not res_json["ShowLoadMore"]:
if res_json["HtmlString"].strip() == "":
break
load_more = False

params["startIndex"] += 50

if format == "raw":
limit -= 50
yield res.text
elif format == "json":
limit -= 50
yield res_json
elif format == "html":
limit -= 50
yield res_json["HtmlString"]
elif format == "soup":
limit -= 50
yield bs4.BeautifulSoup(res_json["HtmlString"].strip(), "html.parser")
elif format == "dict":
for item in self._parser_html_search_results(res_json["HtmlString"]):
if limit <= 0:
return None
limit -= 1
yield item
elif format == "cache":
for item in self._parser_html_search_results(res_json["HtmlString"]):
if limit <= 0:
return None
limit -= 1

properties = {
"type": item["type"],
"name": item["name"],
"favorites": item["FavoritePoint"],
"author": item["owner"],
"pm_only": item["pm_only"],
"found": item["found"],
"state": item["state"],
}
if not item["pm_only"]:
properties.update(
{
"difficulty": item["Difficulty"],
"terrain": item["Terrain"],
"hidden": item["PlaceDate"],
"size": item[
"ContainerSize"
], # TODO I18N issue -> should be handled in class Cache or by a container size class
}
)

yield Cache(self, item["code"], **properties)
else:
raise Exception(f"unknown output format ({format})")

return None
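A minimal usage sketch of the formats described in the docstring above. It assumes a logged-in Geocaching instance named gc and that the search endpoint accepts an "origin" parameter, as the reworked search() further below passes; the coordinates and limits are illustrative only:

    # Yield at most 20 Cache objects around a point.
    for cache in gc.advanced_search(
        options={"origin": "50.087465, 14.420674"},
        sort_by=SortOrder.distance,
        limit=20,
        format="cache",
    ):
        print(cache.wp, cache.name)

    # The same query as plain dicts, one dict per result row.
    for row in gc.advanced_search(options={"origin": "50.087465, 14.420674"}, limit=20, format="dict"):
        print(row["code"], row.get("Difficulty"))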

@staticmethod
def _parser_html_search_results(html):
"""
Parse the HTML string from the advanced search and return a dict for each HTML table row.

:param html: HTML string with the search result rows.
:return: A list of dicts
"""
data = []
dom = bs4.BeautifulSoup(html.strip(), "html.parser")

for table_row in dom.find_all("tr"):
cells = table_row.find_all("td")
cache = {
"number": int(table_row.attrs["data-rownumber"]),
"code": table_row.attrs["data-id"],
"name": table_row.attrs["data-name"],
}

badge = table_row.find("svg", class_="badge")
cache["found"] = "found" in str(badge) if badge is not None else False

cache["pm_only"] = False
for cell in cells:
if "class" not in cell.attrs:
continue
if "cache-primary-details" in cell.attrs["class"]:
details = cell.find_all("span", attrs={"cache-details"})[0].text
cache["type"] = details.split("|")[0].strip()
cache["owner"] = cell.find_all("span", attrs={"owner"})[0].text[
3:
] # delete "by " I18N issue / TODO use I18N helper if avaialble
status = cell.find_all("span", attrs={"status"})
cache["state"] = (
status[0].text if status else "0"
) # is 0 the right value? TODO archived / past events, I18N issue
elif "pm-upsell" in cell.attrs["class"]:
cache["pm_only"] = True
elif "data-column" in cell.attrs:
cache[cell.attrs["data-column"]] = cell.text.strip()
else:
pass

c.size = localized_size_mapping[row.find(attrs={"data-column": "ContainerSize"}).text.strip()]
c.difficulty = row.find(attrs={"data-column": "Difficulty"}).text
c.terrain = row.find(attrs={"data-column": "Terrain"}).text
c.hidden = row.find(attrs={"data-column": "PlaceDate"}).text
c.author = row.find("span", "owner").text[3:] # delete "by "
# 10 cells for premium members,
# 8 cells for basic members, 4 cells if the result is a PM-only cache
if len(cells) not in (10, 8, 4):
raise Exception(
f"Number of cells in row-number={cache['number']} unexpected (cells={len(cells)}) ({cache['name']})"
)

logging.debug("Cache parsed: {}".format(c))
yield c
data.append(cache)
return data
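For illustration, a single entry of the returned list might look roughly like the sketch below for a premium member. The keys mirror what the parser above assigns (the data-column names such as Difficulty or ContainerSize come straight from the page); all values are made up:

    {
        "number": 1,
        "code": "GC12345",          # hypothetical cache code
        "name": "Example Cache",
        "found": False,
        "pm_only": False,
        "type": "Traditional",
        "owner": "SomeOwner",
        "state": "0",
        "Difficulty": "1.5",
        "Terrain": "2",
        "PlaceDate": "01/01/2020",
        "ContainerSize": "Small",
        "FavoritePoint": "10",
    }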

start_index += 1
def search(self, point, limit=float("inf")):
"""Return a generator of caches around some point.

Search for caches around some point by loading search pages and parsing the data from these
pages. Yield :class:`.Cache` objects filled with data from the search page. You can provide a limit
as a convenient way to stop the generator after a certain number of caches.

def _search_get_page(self, point, start_index):
"""Return one page for standard search as class:`bs4.BeautifulSoup` object.
This method uses method advanced_search and available for backward compatibility.

:param .geo.Point point: Search center point.
:param int start_index: Determines the page. If start_index is greater than zero, this
method will use the AJAX endpoint, which is much faster.
"""
assert hasattr(point, "format") and callable(point.format)
logging.debug("Loading page from start_index {}".format(start_index))

if start_index == 0:
# first request has to load normal search page
logging.debug("Using normal search endpoint")

# make request
res = self._request(
self._urls["search"],
params={
"origin": point.format_decimal(),
},
)
return res.find(id="geocaches"), res

else:
# other requests can use AJAX endpoint
logging.debug("Using AJAX search endpoint")

# make request
res = self._request(
self._urls["search_more"],
params={
"origin": point.format_decimal(),
"startIndex": start_index,
"ssvu": 2,
"selectAll": "false",
},
expect="json",
)

return bs4.BeautifulSoup(res["HtmlString"].strip(), "html.parser"), None
logging.info("Searching at {}".format(point))
results = self.advanced_search(
options={
"origin": point.format_decimal(),
},
limit=limit,
format="cache",
)
return results
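Since the wrapper keeps the old signature, existing callers should keep working; a short sketch, again assuming a logged-in Geocaching instance gc:

    from pycaching.geo import Point

    point = Point(49.275, 14.000)
    for cache in gc.search(point, limit=10):
        print(cache.wp, cache.name)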

def search_quick(self, area, *, strict=False, zoom=None):
"""Return a generator of caches in some area.
@@ -391,7 +498,7 @@ def search_rect(
per_query: int = 200,
sort_by: Union[str, SortOrder] = SortOrder.date_last_visited,
origin: Optional[Point] = None,
wait_sleep: bool = True
wait_sleep: bool = True,
):
"""
Return a generator of caches in the given Rectangle area.
89 changes: 89 additions & 0 deletions test/cassettes/advanced_search.json

Large diffs are not rendered by default.

1,765 changes: 1,765 additions & 0 deletions test/cassettes/advanced_search_1000.json

Large diffs are not rendered by default.

173 changes: 173 additions & 0 deletions test/cassettes/advanced_search_caches_owned_by_hq.json

Large diffs are not rendered by default.

105 changes: 14 additions & 91 deletions test/cassettes/geocaching_search.json

Large diffs are not rendered by default.
