diff --git a/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py b/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py index 316b45861..380927089 100644 --- a/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py +++ b/python-threatexchange/threatexchange/exchanges/clients/ncmec/hash_api.py @@ -19,7 +19,7 @@ import urllib.parse import requests -from requests.packages.urllib3.util.retry import Retry +from urllib3.util.retry import Retry from threatexchange.exchanges.clients.utils.common import TimeoutHTTPAdapter @@ -123,6 +123,19 @@ class NCMECEntryType(Enum): video = "video" +@unique +class NCMECFeedbackType(Enum): + md5 = "MD5" + sha1 = "SHA1" + pdna = "PDNA" + pdq = "PDQ" + netclean = "NETCLEAN" + videntifier = "VIDENTIFIER" + tmk_pdqf = "TMK_PDQF" + ssvh_pdna = "SSVH_PDNA" + ssvh_safer_hash = "SSVH_SAFER_HASH" + + @dataclass class NCMECEntryUpdate: id: str @@ -131,6 +144,7 @@ class NCMECEntryUpdate: deleted: bool classification: t.Optional[str] fingerprints: t.Dict[str, str] + feedback: t.List[t.Dict[str, t.Any]] @classmethod def from_xml(cls, xml: _XMLWrapper) -> "NCMECEntryUpdate": @@ -148,6 +162,36 @@ def from_xml(cls, xml: _XMLWrapper) -> "NCMECEntryUpdate": fingerprints={ x.tag: x.text for x in xml.maybe("fingerprints") if x.has_text }, + feedback=( + [ + { + "sentiment": x.tag, # "affirmativeFeedback" or "negativeFeedback" + "type": x.str("type"), + "latest_feedback_time": x.str("lastUpdateTimestamp"), + "members": [ + {"id": m.str("id"), "name": m.text} + for m in x.maybe("members") + if m.has_text + ], + "reasons": [ + { + "guid": r.maybe("reason").str("guid"), + "name": r.maybe("reason").str("name"), + "type": r.maybe("reason").str("type"), + "members": [ + {"id": m.str("id"), "name": m.text} + for m in x.maybe("members") + ], + } + for r in x.maybe("reasons") + if r.maybe("reason") + ], + } + for x in xml.maybe("feedback") + ] + if xml.maybe("feedback").has_text + else [] + ), ) @@ -215,11 +259,49 @@ def estimated_entries_in_range(self) -> int: ) +# TODO: once we know the shape of response, finish this class +@dataclass +class UpdateEntryResponse: + updates: t.List[NCMECEntryUpdate] + + @classmethod + def from_xml( + cls, xml: _XMLWrapper, fallback_max_time: int + ) -> "UpdateEntryResponse": + updates: t.List[NCMECEntryUpdate] = [] + + for content_xml in (xml.maybe("images"), xml.maybe("videos")): + if not content_xml or not len(content_xml): + continue + updates.extend(NCMECEntryUpdate.from_xml(c) for c in content_xml) + + return cls(updates) + + +@dataclass +class GetFeedbackReasonsResponse: + reasons: t.List[t.Dict[str, str]] + + @classmethod + def from_xml(cls, xml: _XMLWrapper) -> "GetFeedbackReasonsResponse": + reasons = [] + for reason in xml.maybe("availableFeedbackReasons"): + reasons.append( + { + "guid": reason.str("guid"), + "name": reason.str("name"), + "type": reason.str("type"), + } + ) + return cls(reasons) + + @unique class NCMECEndpoint(Enum): status = "status" entries = "entries" members = "members" + feedback = "feedback" class NCMECEnvironment(Enum): @@ -261,15 +343,19 @@ def __init__( username: str, password: str, environment: NCMECEnvironment, + member_id: t.Optional[str] = None, + reasons_map: t.Dict[str, t.List[t.Dict[str, str]]] = {}, ) -> None: assert is_valid_user_pass(username, password) self.username = username self.password = password self._base_url = environment.value + self.member_id = member_id + self.reasons_map = reasons_map or {} def _get_session(self) -> requests.Session: """ - Custom requests sesson + Custom requests session Ideally, should be used within a context manager: ``` @@ -295,7 +381,9 @@ def _get_session(self) -> requests.Session: ) return session - def _get(self, endpoint: NCMECEndpoint, *, next_: str = "", **params) -> ET.Element: + def _get( + self, endpoint: NCMECEndpoint, *, path: str = "", next_: str = "", **params + ) -> ET.Element: """ Perform an HTTP GET request, and return the XML response payload. @@ -303,6 +391,8 @@ def _get(self, endpoint: NCMECEndpoint, *, next_: str = "", **params) -> ET.Elem """ url = "/".join((self._base_url, self.VERSION, endpoint.value)) + if path: + url = "/".join((url, path)) if next_: url = self._base_url + next_ params = {} @@ -328,16 +418,49 @@ def _post(self, endpoint: NCMECEndpoint, *, data=None) -> t.Any: No timeout or retry strategy. """ - url = "/".join((self._base_url, endpoint.value)) + url = "/".join((self._base_url, self.VERSION, endpoint.value)) with self._get_session() as session: response = session.post(url, data=data) response.raise_for_status() return response + def _put( + self, + endpoint: NCMECEndpoint, + *, + member_id: t.Optional[str] = None, + entry_id: t.Optional[str] = None, + feedback_type: t.Optional[NCMECFeedbackType] = None, + data=None, + ) -> t.Any: + """ + Perform an HTTP PUT request, and return the XML response payload. + + No timeout or retry strategy. + """ + + url = "/".join((self._base_url, self.VERSION, endpoint.value)) + if feedback_type and member_id and entry_id: + url = "/".join( + ( + self._base_url, + endpoint.value, + member_id, + entry_id, + feedback_type.value, + NCMECEndpoint.feedback.value, + ) + ) + with self._get_session() as session: + response = session.put(url, data=data) + response.raise_for_status() + return response + def status(self) -> StatusResult: """Query the status endpoint, which tells you who you are.""" response = self._get(NCMECEndpoint.status) member = _XMLWrapper(response)["member"] + self.member_id = member.str("id") return StatusResult(member.int("id"), member.text) def members(self) -> t.List[StatusResult]: @@ -348,6 +471,17 @@ def members(self) -> t.List[StatusResult]: for member in _XMLWrapper(response) ] + def feedback_reasons(self) -> GetFeedbackReasonsResponse: + """Get the possible negative feedback reasons for each feedback type""" + for feedbackType in NCMECFeedbackType: + resp = self._get( + NCMECEndpoint.feedback, path=f"{feedbackType.value}/reasons" + ) + reasonsResp = GetFeedbackReasonsResponse.from_xml(_XMLWrapper(resp)) + self.reasons_map[feedbackType.value] = reasonsResp.reasons + + return reasonsResp + def get_entries( self, *, @@ -401,6 +535,55 @@ def get_entries_iter( has_more = bool(next_) yield result + def submit_feedback( + self, + entry_id: str, + feedback_type: NCMECFeedbackType, + affirmative: bool, + reason_id: t.Optional[str] = None, + ) -> GetEntriesResponse: + if not affirmative and not reason_id: + raise ValueError("Negative feedback must have a reason_id") + + # need member_id to submit feedback + if not self.member_id: + self.status() + + # need valid reasons to submit negative feedback + if not affirmative and not self.reasons_map: + self.feedback_reasons() + + # Prepare the XML payload + root = ET.Element("feedbackSubmission") + root.set("xmlns", "https://hashsharing.ncmec.org/hashsharing/v2") + vote = ET.SubElement(root, "affirmative" if affirmative else "negative") + + if not affirmative: + valid_reason_ids = [ + reason["guid"] for reason in self.reasons_map[feedback_type.value] + ] + if reason_id not in valid_reason_ids: + print( + "must choose from the following reasons: ", + self.reasons_map[feedback_type.value], + ) + raise ValueError("Invalid reason_id") + reasons = ET.SubElement(vote, "reasonIds") + guid = ET.SubElement(reasons, "guid") + guid.text = reason_id + # ET.dump(root) + + resp = self._put( + NCMECEndpoint.entries, + member_id=self.member_id, + entry_id=entry_id, + feedback_type=feedback_type, + data=ET.tostring(root), + ) + + # TODO: parse response here once we know the shape using UpdateEntryResponse + return resp + def _date_format(timestamp: int) -> str: """ISO 8601 format yyyy-MM-dd'T'HH:mm:ss.SSSZ""" diff --git a/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/data.py b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/data.py new file mode 100644 index 000000000..21b7a9ca5 --- /dev/null +++ b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/data.py @@ -0,0 +1,220 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. + +STATUS_XML = """ + + + 127.0.0.1 + testington + Sir Testington + +""".strip() + +NEXT_UNESCAPED = ( + "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" + "&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000" +) + +NEXT_UNESCAPED2 = ( + "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" + "&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000" +) +NEXT_UNESCAPED3 = ( + "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" + "&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000" +) + +ENTRIES_XML = """ + + + + + Example Member + 2017-10-24T15:00:00Z + image1 + A1 + + a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 + a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 + a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1... + + + + + Example Member + + + + + + + Example Member + + + + + + + Example Member2 + image4 + 2017-10-24T15:10:00Z + + + + + + Example Member + video4 + 2017-10-24T15:20:00Z + + + + /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000 + + +""".strip() + + +ENTRIES_XML2 = """ + + + + + Example Member + 2019-10-24T15:00:00Z + image10 + A1 + + b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 + b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 + b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1... + + + + + + /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000 + + +""".strip() + +# This example isn't in the documentation, but shows how updates work +ENTRIES_XML3 = """ + + + + + + + + /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000 + + +""".strip() + +ENTRIES_XML4 = """ + + + + + + TX Example + 2019-11-25T15:10:00Z + willdelete + + + +""".strip() + +ENTRIES_LARGE_FINGERPRINTS = """ + + + + + + +""".strip() + +STATUS_XML = """ + + + 1.1.1.1 + test_user + test member + +""".strip() + +FEEDBACK_REASONS_XML = """ + + + + +""".strip() + +AFFIRMATIVE_FEEDBACK_XML = """ + + + + + + +""".strip() + +NEGATIVE_FEEDBACK_XML = """ + + + + + 01234567-abcd-0123-4567-012345678900 + + + +""".strip() + +UPDATE_FEEDBACK_RESULT_XML = """ + + + + +""".strip() diff --git a/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py index 89968543a..ba946d504 100644 --- a/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py +++ b/python-threatexchange/threatexchange/exchanges/clients/ncmec/tests/test_hash_api.py @@ -1,177 +1,29 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. from unittest.mock import Mock -import urllib.parse import typing as t import pytest import requests from threatexchange.exchanges.clients.ncmec.hash_api import ( NCMECEntryType, NCMECEntryUpdate, + NCMECFeedbackType, NCMECHashAPI, NCMECEnvironment, ) - -STATUS_XML = """ - - - 127.0.0.1 - testington - Sir Testington - -""".strip() - -NEXT_UNESCAPED = ( - "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" - "&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000" -) - -NEXT_UNESCAPED2 = ( - "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" - "&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000" -) -NEXT_UNESCAPED3 = ( - "/v2/entries?from=2017-10-20T00%3A00%3A00.000Z" - "&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000" +from threatexchange.exchanges.clients.ncmec.tests.data import ( + ENTRIES_LARGE_FINGERPRINTS, + ENTRIES_XML, + ENTRIES_XML2, + ENTRIES_XML3, + ENTRIES_XML4, + NEXT_UNESCAPED, + NEXT_UNESCAPED2, + NEXT_UNESCAPED3, + STATUS_XML, + UPDATE_FEEDBACK_RESULT_XML, ) -ENTRIES_XML = """ - - - - - Example Member - 2017-10-24T15:00:00Z - image1 - A1 - - a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 - a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 - a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1... - - - - Example Member2 - image4 - 2017-10-24T15:10:00Z - - - - - - Example Member - video4 - 2017-10-24T15:20:00Z - - - - /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=2001&size=1000&max=3000 - - -""".strip() - - -ENTRIES_XML2 = """ - - - - - Example Member - 2019-10-24T15:00:00Z - image10 - A1 - - b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 - b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1 - b1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1... - - - - - /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=3001&size=1000&max=4000 - - -""".strip() - -# This example isn't in the documentation, but shows how updates work -ENTRIES_XML3 = """ - - - - - - - - /v2/entries?from=2017-10-20T00%3A00%3A00.000Z&to=2017-10-30T00%3A00%3A00.000Z&start=4001&size=1000&max=5000 - - -""".strip() - -ENTRIES_XML4 = """ - - - - - - TX Example - 2019-11-25T15:10:00Z - willdelete - - - -""".strip() - -ENTRIES_LARGE_FINGERPRINTS = """ - - - - - - -""".strip() - def mock_get_impl(url: str, **params): content = ENTRIES_XML @@ -323,3 +175,35 @@ def test_large_fingerprint_entries(monkeypatch): assert len(update.fingerprints) == 1 assert update.fingerprints == {"md5": "facefacefacefacefacefacefaceface"} assert result.next == "" + + +def test_feedback_entries(monkeypatch): + api = NCMECHashAPI( + "fake_user", + "fake_pass", + NCMECEnvironment.test_Industry, + member_id="123", + reasons_map={ + NCMECFeedbackType.md5.value: [ + { + "guid": "01234567-abcd-0123-4567-012345678900", + "name": "Example Reason 1", + "type": "Sha1", + } + ] + }, + ) + session = Mock( + strict_spec=["put", "__enter__", "__exit__"], + put=set_api_return(UPDATE_FEEDBACK_RESULT_XML), + __enter__=lambda _: session, + __exit__=lambda *args: None, + ) + monkeypatch.setattr(api, "_get_session", lambda: session) + + result = api.submit_feedback("image1", NCMECFeedbackType.md5, True) + result = api.submit_feedback( + "image1", NCMECFeedbackType.md5, False, "01234567-abcd-0123-4567-012345678900" + ) + + assert result.status_code == 200