diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..1dbc217 --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +[flake8] + ignore = E203, E266, E501, W503, F403, F401 + max-line-length = 89 + max-complexity = 18 + select = B,C,E,F,W,T4,B9 diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml deleted file mode 100644 index 60c3da9..0000000 --- a/.github/workflows/pylint.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: linting - -on: [push] - -jobs: - build: - - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.8", "3.9", "3.10"] - - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - make install - - name: Analysing the code with pylint - run: | - pylint $(git ls-files '*.py') diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 236d00c..315283f 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.10", "3.11"] steps: - uses: actions/checkout@v3 @@ -25,9 +25,9 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - make install + flit install - name: Test with pytest env: MC_API_TOKEN: ${{ secrets.MC_API_TOKEN }} run: | - make test + pytest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..6e5649d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: + - repo: http://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + - id: check-json + - id: check-toml +# - repo: 
http://github.com/pre-commit/mirrors-mypy +# rev: v1.6.1 +# hooks: +# - id: mypy +# entry: mypy + - repo: http://github.com/pycqa/flake8 + rev: 6.1.0 + hooks: + - id: flake8 + - repo: http://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + name: isort (python) diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 58145bd..0000000 --- a/.pylintrc +++ /dev/null @@ -1,566 +0,0 @@ -[MASTER] - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code -extension-pkg-whitelist= - -# Add files or directories to the blacklist. They should be base names, not -# paths. -ignore= - -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Use multiple processes to speed up Pylint. -jobs=1 - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - -# Pickle collected data for later comparisons. -persistent=yes - -# Specify a configuration file. -#rcfile= - -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages -suggestion-mode=yes - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -confidence= - -# Disable the message, report, category or checker with the given id(s). 
You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" -disable=print-statement, - parameter-unpacking, - unpacking-in-except, - old-raise-syntax, - backtick, - long-suffix, - old-ne-operator, - old-octal-literal, - import-star-module-level, - non-ascii-bytes-literal, - invalid-unicode-literal, - raw-checker-failed, - bad-inline-option, - locally-disabled, - locally-enabled, - file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - apply-builtin, - basestring-builtin, - buffer-builtin, - cmp-builtin, - coerce-builtin, - execfile-builtin, - file-builtin, - long-builtin, - raw_input-builtin, - reduce-builtin, - standarderror-builtin, - unicode-builtin, - xrange-builtin, - coerce-method, - delslice-method, - getslice-method, - setslice-method, - no-absolute-import, - old-division, - dict-iter-method, - dict-view-method, - next-method-called, - metaclass-assignment, - indexing-exception, - raising-string, - reload-builtin, - oct-method, - hex-method, - nonzero-method, - cmp-method, - input-builtin, - round-builtin, - intern-builtin, - unichr-builtin, - map-builtin-not-iterating, - zip-builtin-not-iterating, - range-builtin-not-iterating, - filter-builtin-not-iterating, - using-cmp-argument, - eq-without-hash, - div-method, - idiv-method, - rdiv-method, - exception-message-attribute, - invalid-str-codec, - sys-max-int, - bad-python3-import, - deprecated-string-function, - deprecated-str-translate-call, - deprecated-itertools-function, - 
deprecated-types-field, - next-method-defined, - dict-items-not-iterating, - dict-keys-not-iterating, - dict-values-not-iterating, - deprecated-operator-function, - deprecated-urllib-function, - xreadlines-attribute, - deprecated-sys-function, - exception-escape, - comprehension-escape, - missing-docstring, - no-member, - bare-except, - relative-import, - too-few-public-methods, - fixme, - broad-except, - # add these back in over time - invalid-name, - logging-format-interpolation, - too-many-lines, - too-many-arguments, - too-many-branches, - too-many-statements, - too-many-public-methods - - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member - - -[REPORTS] - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - -# Set the output format. Available formats are text, parseable, colorized, json -# and msvs (visual studio).You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=colorized - -# Tells whether to display a full report or only the messages -reports=no - -# Activate the evaluation score. 
-score=yes - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - -# Complete name of functions that never returns. When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=optparse.Values,sys.exit - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging - - -[SPELLING] - -# Limits count of emitted suggestions for spelling mistakes -max-spelling-suggestions=4 - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME, - XXX, - TODO - - -[SIMILARITIES] - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - -# Minimum lines number of a similarity. -min-similarity-lines=20 - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. 
Python regular -# expressions are accepted. -generated-members= - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid to define new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. 
-allow-global-unused-variables=yes - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_, - _cb - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,past.builtins,future.builtins,io,builtins - - -[FORMAT] - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=120 - -# Maximum number of lines in a module -max-module-lines=1000 - -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check=trailing-comma, - dict-separator - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. 
-single-line-if-stmt=no - - -[BASIC] - -# Naming style matching correct argument names -argument-naming-style=snake_case - -# Regular expression matching correct argument names. Overrides argument- -# naming-style -#argument-rgx= - -# Naming style matching correct attribute names -attr-naming-style=snake_case - -# Regular expression matching correct attribute names. Overrides attr-naming- -# style -#attr-rgx= - -# Bad variable names which should always be refused, separated by a comma -bad-names=foo, - bar, - baz, - toto, - tutu, - tata - -# Naming style matching correct class attribute names -class-attribute-naming-style=any - -# Regular expression matching correct class attribute names. Overrides class- -# attribute-naming-style -#class-attribute-rgx= - -# Naming style matching correct class names -class-naming-style=PascalCase - -# Regular expression matching correct class names. Overrides class-naming-style -#class-rgx= - -# Naming style matching correct constant names -const-naming-style=UPPER_CASE - -# Regular expression matching correct constant names. Overrides const-naming- -# style -#const-rgx= - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# Naming style matching correct function names -function-naming-style=snake_case - -# Regular expression matching correct function names. Overrides function- -# naming-style -#function-rgx= - -# Good variable names which should always be accepted, separated by a comma -good-names=i, - j, - k, - ex, - Run, - _ - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# Naming style matching correct inline iteration names -inlinevar-naming-style=any - -# Regular expression matching correct inline iteration names. Overrides -# inlinevar-naming-style -#inlinevar-rgx= - -# Naming style matching correct method names -method-naming-style=snake_case - -# Regular expression matching correct method names. 
Overrides method-naming- -# style -#method-rgx= - -# Naming style matching correct module names -module-naming-style=snake_case - -# Regular expression matching correct module names. Overrides module-naming- -# style -#module-rgx= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=^_ - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -property-classes=abc.abstractproperty - -# Naming style matching correct variable names -variable-naming-style=snake_case - -# Regular expression matching correct variable names. Overrides variable- -# naming-style -#variable-rgx= - - -[DESIGN] - -# Maximum number of arguments for function / method -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in a if statement -max-bool-expr=5 - -# Maximum number of branch for function / method body -max-branches=12 - -# Maximum number of locals for function / method body -max-locals=20 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body -max-returns=6 - -# Maximum number of statements in function / method body -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp - -# List of member names, which should be excluded from the protected access -# warning. 
-exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=mcs - - -[IMPORTS] - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=regsub, - TERMIOS, - Bastion, - rexec - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=Exception \ No newline at end of file diff --git a/Makefile b/Makefile deleted file mode 100644 index 09642c0..0000000 --- a/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -PYLINT := env PYTHONPATH=$(PYTHONPATH) pylint - -install: - pip install -e .[dev] - -lint: - $(PYLINT) mediacloud - -test: - pytest - -build-release: - find . 
-name '.DS_Store' -type f -delete - python setup.py sdist - -release-test: - twine upload --repository-url https://test.pypi.org/legacy/ dist/* - -release: - twine upload dist/* diff --git a/README.md b/README.md index d99e8f1..d2bed32 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ MediaCloud Python API Client ============================ -🚧 Under construction 🚧 +🚧 Under construction 🚧 -This is a python client for accessing the MediaCloud API v4. This allows you to perform cross-platform searches and +This is a python client for accessing the MediaCloud API v4. This allows you to perform cross-platform searches and also browse our collection/source/feed directory. ![pylint](https://github.com/mediacloud/api-client/actions/workflows/pylint.yml/badge.svg) ![pytest](https://github.com/mediacloud/api-client/actions/workflows/pytest.yml/badge.svg) [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/mitmedialab/MediaCloud-API-Client/blob/master/LICENSE) @@ -22,13 +22,46 @@ Check `CHANGELOG.md` for a detailed history of changes. Take a look at the test in the `mediacloud/test/` module for more detailed examples. 
+#### Count Stories Matching a Query + +```python +import mediacloud.api +US_NATIONAL_COLLECTION = 34412234 +mc_search = mediacloud.api.SearchAPI(YOUR_MC_API_KEY) +all_stories = [] +pagination_token = None +more_stories = True +while more_stories: + page, pagination_token = mc_search.story_list('robots', start_date= , end_date= collection_ids=[US_NATIONAL_COLLECTION]) + all_stories += page + more_stories = pagination_token is not None +print(f"Retrived {len(all_stories)} matching stories") +``` + +#### Page Through Stories Matching a Query + +```python +import mediacloud.api +INDIA_NATIONAL_COLLECTION = 34412118 +mc_search = mediacloud.api.SearchAPI(YOUR_MC_API_KEY) +all_stories = [] +pagination_token = None +more_stories = True +while more_stories: + page, pagination_token = mc_search.story_list('modi AND biden', collection_ids=[INDIA_NATIONAL_COLLECTION], + pagination_token=pagination_token) + all_stories += page + more_stories = pagination_token is not None +print(f"Retrived {len(all_stories)} matching stories") +``` + #### Fetch all Sources in a Collection ```python import mediacloud.api INDIA_NATIONAL_COLLECTION = 34412118 SOURCES_PER_PAGE = 100 # the number of sources retrieved per page -mc_directory = mediacloud.api.DirectoryApi(MC_API_KEY) +mc_directory = mediacloud.api.DirectoryApi(YOUR_MC_API_KEY) sources = [] offset = 0 # offset for paging through while True: @@ -51,20 +84,17 @@ If you are interested in adding code to this module, first clone [the GitHub rep ### Installing -`make install` +* `flit install` +* `pre-commit install` ### Testing -`make test` +`pytest` ### Distributing a New Version -If you want to, setup [twin's keyring integration](https://pypi.org/project/twine/) to avoid typing your PyPI -password over and over. - -1. Run `make test` to make sure all the test pass -2. Update the version number in `mediacloud/__init__.py` -3. Make a brief note in the CHANGELOG.md about what changes -4. 
Run `make build-release` to create an install package -5. Run `make release-test` to upload it to PyPI's test platform -6. Run `make release` to upload it to PyPI +1. Run `pytest` to make sure all the test pass +2. Update the version number in `pyproject.toml` +3. Make a brief note in the `CHANGELOG.md` about what changes +4. Run `flit build` to create an install package +5. Run `flit publish` to upload it to PyPI diff --git a/doc/python-versions.md b/doc/python-versions.md deleted file mode 100644 index 95b5a25..0000000 --- a/doc/python-versions.md +++ /dev/null @@ -1,40 +0,0 @@ -Python Version Management (OSX) -============================== - -Our dev codebase currently runs with Python v2.7.x on OSX. However, that is sure to change. Managing Python -versions can be hard, so this document introduce our approach. - -PyEnv ------ - -We manage different versions with [pyenv](https://github.com/pyenv/pyenv). Install this with HomeBrew: -``` -brew update -brew install pyenv -``` - -Then install the versions of Python we need: -``` -pyenv install 3.6.5 -``` - -PyEnv-VirtualEnv ----------------- - -For managing a virtual enviromnent with a specific version of python for our project, we use -[pyenv-virtualenv](https://github.com/pyenv/pyenv-virtualenv). Install this with homebrew as well -``` -brew install pyenv-virtualenv -``` -As noted in their readme, you'll need to add these two lines to your `.bash_profile` file (or you `.profile` file). Then open a new terminal session: -``` -eval "$(pyenv init -)" -eval "$(pyenv virtualenv-init -)" -``` - -And then create a virtualenv for this project. 
The name is important, because the `.python-version` file -refers to it so it loads autoamtically when you enter the directory (if `eval "$(pyenv virtualenv-init -)"` -is in your `.profile`): -``` -pyenv virtualenv 3.6.5 mc-api-client -``` diff --git a/mediacloud/__init__.py b/mediacloud/__init__.py index 1a3bef5..e69de29 100755 --- a/mediacloud/__init__.py +++ b/mediacloud/__init__.py @@ -1 +0,0 @@ -__version__ = '4.0.1' diff --git a/mediacloud/api.py b/mediacloud/api.py index 122bcbe..75b26ae 100644 --- a/mediacloud/api.py +++ b/mediacloud/api.py @@ -1,11 +1,12 @@ -import logging import datetime as dt -from typing import Dict, Optional, Union +import logging +from typing import Any, Dict, List, Optional, Union + import requests + import mediacloud import mediacloud.error - logger = logging.getLogger(__name__) @@ -14,11 +15,11 @@ class BaseApi: # Default applied to all queries made to main server. You can alter this on # your instance if you want to bail out more quickly, or know you have longer # running queries - TIMEOUT_SECS = 30 + TIMEOUT_SECS = 60 BASE_API_URL = "https://search.mediacloud.org/api/" - def __init__(self, auth_token=str): + def __init__(self, auth_token: Optional[str] = None): if not auth_token: raise mediacloud.error.MCException("No api key set - nothing will work without this") # Specify the auth_token to use for all future requests @@ -26,6 +27,7 @@ def __init__(self, auth_token=str): # better performance to put all HTTP through this one object self._session = requests.Session() self._session.headers.update({'Authorization': f'Token {self._auth_token}'}) + self._session.headers.update({'Accept': 'application/json'}) def user_profile(self) -> Dict: # :return: basic info about the current user, including their roles @@ -38,7 +40,7 @@ def version(self) -> Dict: """ return self._query('version') - def _query(self, endpoint: str, params: Dict = None, method: str = 'GET'): + def _query(self, endpoint: str, params: Optional[Dict] = None, method: str 
= 'GET') -> Dict: """ Centralize making the actual queries here for easy maintenance and testing of HTTP comms """ @@ -62,8 +64,8 @@ class DirectoryApi(BaseApi): PLATFORM_REDDIT = "reddit" def collection_list(self, platform: Optional[str] = None, name: Optional[str] = None, - limit: Optional[int] = 0, offset: Optional[int] = 0): - params = dict(limit=limit, offset=offset) + limit: Optional[int] = 0, offset: Optional[int] = 0) -> Dict: + params: Dict[Any, Any] = dict(limit=limit, offset=offset) if name: params['name'] = name if platform: @@ -72,8 +74,8 @@ def collection_list(self, platform: Optional[str] = None, name: Optional[str] = def source_list(self, platform: Optional[str] = None, name: Optional[str] = None, collection_id: Optional[int] = None, - limit: Optional[int] = 0, offset: Optional[int] = 0): - params = dict(limit=limit, offset=offset) + limit: Optional[int] = 0, offset: Optional[int] = 0) -> Dict: + params: Dict[Any, Any] = dict(limit=limit, offset=offset) if collection_id: params['collection_id'] = collection_id if name: @@ -82,10 +84,11 @@ def source_list(self, platform: Optional[str] = None, name: Optional[str] = None params['platform'] = platform return self._query('sources/sources/', params) - def feed_list(self, source_id: Optional[int] = None, modified_since: Optional[Union[dt.datetime, int, float]] = None, + def feed_list(self, source_id: Optional[int] = None, + modified_since: Optional[Union[dt.datetime, int, float]] = None, modified_before: Optional[Union[dt.datetime, int, float]] = None, - limit: Optional[int] = 0, offset: Optional[int] = 0): - params = dict(limit=limit, offset=offset) + limit: Optional[int] = 0, offset: Optional[int] = 0) -> Dict: + params: Dict[Any, Any] = dict(limit=limit, offset=offset) if source_id: params['source_id'] = source_id @@ -93,7 +96,7 @@ def epoch_param(t, param): if t is None: return # parameter not set if isinstance(t, dt.datetime): - params[param] = t.timestamp() # get epoch time + params[param] = 
class SearchApi(BaseApi):
    """Client for the Media Cloud online-news search API endpoints.

    Wraps story counting, story listing (with pagination), single-story
    lookup, and word/source/language sampling. Every query method accepts
    an inclusive date window, optional collection/source id filters, and an
    optional ``platform`` override (defaults to :data:`PROVIDER`).
    """

    # default search provider sent with every request
    PROVIDER = "onlinenews-mediacloud"

    def _prep_default_params(self, query: str, start_date: dt.date, end_date: dt.date,
                             collection_ids: Optional[List[int]] = None,
                             source_ids: Optional[List[int]] = None,
                             platform: Optional[str] = None) -> Dict[str, Any]:
        """Build the query parameters shared by every search endpoint.

        :param query: search query string
        :param start_date: inclusive start of the date window
        :param end_date: inclusive end of the date window
        :param collection_ids: optional collection ids, comma-joined as ``cs``
        :param source_ids: optional source ids, comma-joined as ``ss``
        :param platform: optional provider override; falls back to PROVIDER
        :return: dict of request parameters
        """
        # mutable-default fix: treat None the same as "no ids supplied"
        source_ids = source_ids or []
        collection_ids = collection_ids or []
        params: Dict[str, Any] = dict(
            start=start_date.isoformat(),
            end=end_date.isoformat(),
            q=query,
            # honor an explicit platform override instead of silently ignoring it
            platform=platform if platform else self.PROVIDER,
        )
        # BUG FIX: the previous version left a trailing comma after each
        # join(), which wrapped the value in a 1-tuple instead of a string.
        if source_ids:
            params['ss'] = ",".join(str(sid) for sid in source_ids)
        if collection_ids:
            params['cs'] = ",".join(str(cid) for cid in collection_ids)
        return params

    def story_count(self, query: str, start_date: dt.date, end_date: dt.date,
                    collection_ids: Optional[List[int]] = None,
                    source_ids: Optional[List[int]] = None,
                    platform: Optional[str] = None) -> Dict:
        """Return the count payload for a query (the server's 'count' object)."""
        params = self._prep_default_params(query, start_date, end_date,
                                           collection_ids, source_ids, platform)
        results = self._query('search/total-count', params)
        return results['count']

    def story_count_over_time(self, query: str, start_date: dt.date, end_date: dt.date,
                              collection_ids: Optional[List[int]] = None,
                              source_ids: Optional[List[int]] = None,
                              platform: Optional[str] = None) -> List[Dict]:
        """Return per-day matching-story counts over the date window."""
        params = self._prep_default_params(query, start_date, end_date,
                                           collection_ids, source_ids, platform)
        results = self._query('search/count-over-time', params)
        return results['count_over_time']['counts']

    def story_list(self, query: str, start_date: dt.date, end_date: dt.date,
                   collection_ids: Optional[List[int]] = None,
                   source_ids: Optional[List[int]] = None,
                   platform: Optional[str] = None,
                   expanded: Optional[bool] = None,
                   pagination_token: Optional[str] = None,
                   sort_field: Optional[str] = None,
                   sort_order: Optional[str] = None,
                   page_size: Optional[int] = None) -> tuple[List[Dict], Optional[str]]:
        """Return one page of matching stories plus a token for the next page.

        :param expanded: if truthy, request full story content (staff-only)
        :param pagination_token: token from a previous call to continue paging
        :param sort_field: server-side sort field (e.g. ``indexed_date``)
        :param sort_order: ``asc`` or ``desc``
        :param page_size: number of stories per page
        :return: (stories, pagination_token); token is None on the last page
        """
        params = self._prep_default_params(query, start_date, end_date,
                                           collection_ids, source_ids, platform)
        if expanded:
            params['expanded'] = 1
        if pagination_token:
            params['pagination_token'] = pagination_token
        if sort_field:
            params['sort_field'] = sort_field
        if sort_order:
            params['sort_order'] = sort_order
        if page_size:
            params['page_size'] = page_size
        results = self._query('search/story-list', params)
        # normalize ISO date strings (first 10 chars) to datetime.date objects
        for s in results['stories']:
            s['publish_date'] = dt.date.fromisoformat(s['publish_date'][:10]) if s['publish_date'] else None
            s['indexed_date'] = dt.date.fromisoformat(s['indexed_date'][:10]) if s['indexed_date'] else None
        return results['stories'], results['pagination_token']

    def story(self, story_id: str) -> Dict:
        """Return the full metadata for a single story by its id."""
        params = dict(storyId=story_id, platform=self.PROVIDER)
        results = self._query('search/story', params)
        return results['story']

    def _sampled_list(self, endpoint: str, result_key: str, query: str,
                      start_date: dt.date, end_date: dt.date,
                      collection_ids: Optional[List[int]],
                      source_ids: Optional[List[int]],
                      platform: Optional[str],
                      limit: Optional[int]) -> List[Dict]:
        """Shared implementation for the words/sources/languages endpoints."""
        params = self._prep_default_params(query, start_date, end_date,
                                           collection_ids, source_ids, platform)
        if limit:
            params['limit'] = limit
        results = self._query(endpoint, params)
        return results[result_key]

    def words(self, query: str, start_date: dt.date, end_date: dt.date,
              collection_ids: Optional[List[int]] = None,
              source_ids: Optional[List[int]] = None,
              platform: Optional[str] = None,
              limit: Optional[int] = None) -> List[Dict]:
        """Return top words in a sample of matching stories."""
        return self._sampled_list('search/words', 'words', query, start_date, end_date,
                                  collection_ids, source_ids, platform, limit)

    def sources(self, query: str, start_date: dt.date, end_date: dt.date,
                collection_ids: Optional[List[int]] = None,
                source_ids: Optional[List[int]] = None,
                platform: Optional[str] = None,
                limit: Optional[int] = None) -> List[Dict]:
        """Return top sources among matching stories, with counts."""
        return self._sampled_list('search/sources', 'sources', query, start_date, end_date,
                                  collection_ids, source_ids, platform, limit)

    def languages(self, query: str, start_date: dt.date, end_date: dt.date,
                  collection_ids: Optional[List[int]] = None,
                  source_ids: Optional[List[int]] = None,
                  platform: Optional[str] = None,
                  limit: Optional[int] = None) -> List[Dict]:
        """Return language distribution among matching stories."""
        return self._sampled_list('search/languages', 'languages', query, start_date, end_date,
                                  collection_ids, source_ids, platform, limit)
class SearchApiTest(TestCase):
    """Integration tests for SearchApi against the live API.

    Renamed from the copy-pasted 'DirectoryTest' (that name belongs to
    api_directory_test.py) to reflect what is actually under test. As a
    unittest.TestCase subclass it is collected by pytest regardless of the
    ``python_classes`` naming pattern. Requires MC_API_TOKEN in the
    environment.
    """

    START_DATE = dt.date(2023, 11, 1)
    END_DATE = dt.date(2023, 12, 1)

    def setUp(self):
        # API token comes from the environment (set in CI via secrets)
        self._mc_api_key = os.getenv("MC_API_TOKEN")
        self._search = mediacloud.api.SearchApi(self._mc_api_key)

    def test_story_count(self):
        results = self._search.story_count(query="weather", start_date=self.START_DATE,
                                           end_date=self.END_DATE,
                                           collection_ids=[COLLECTION_US_NATIONAL],
                                           source_ids=[AU_BROADCAST_COMPANY])
        # count payload holds both the matching and the total story counts
        assert 'relevant' in results
        assert results['relevant'] > 0
        assert 'total' in results
        assert results['total'] > 0
        assert results['relevant'] <= results['total']

    def test_story_count_over_time(self):
        results = self._search.story_count_over_time(query="weather", start_date=self.START_DATE,
                                                     end_date=self.END_DATE,
                                                     collection_ids=[COLLECTION_US_NATIONAL])
        # one bucket per day, inclusive of both endpoints
        assert len(results) == (self.END_DATE - self.START_DATE).days + 1
        for day in results:
            assert 'date' in day
            assert 'count' in day
            assert 'total_count' in day
            assert day['count'] <= day['total_count']
            assert 'ratio' in day
            assert day['ratio'] < 1

    def test_story(self):
        # NOTE(review): expected to fail right now; this id may not be valid
        # (derived by md5-hashing a staging hostname) — confirm a real id.
        story_id = 'eebfb686618e34a9bc6e87e87e90c54b'
        story = self._search.story(story_id)
        assert 'id' in story
        assert story['id'] == story_id
        assert 'title' in story
        assert 'url' in story
        assert 'language' in story
        assert 'publish_date' in story
        assert 'publish_day' in story

    def test_words(self):
        # NOTE(review): expected to fail for now (endpoint not live yet)
        results = self._search.words(query="weather", start_date=self.START_DATE,
                                     end_date=self.END_DATE,
                                     collection_ids=[COLLECTION_US_NATIONAL],
                                     limit=10)
        assert len(results) > 0

    def test_sources(self):
        results = self._search.sources(query="weather", start_date=self.START_DATE,
                                       end_date=self.END_DATE,
                                       collection_ids=[COLLECTION_US_NATIONAL])
        assert len(results) > 0
        # results must come back sorted by count, descending
        last_count = float("inf")
        for s in results:
            assert 'source' in s
            assert 'count' in s
            assert s['count'] > 0
            assert s['count'] <= last_count
            last_count = s['count']

    def test_languages(self):
        results = self._search.languages(query="weather", start_date=self.START_DATE,
                                         end_date=self.END_DATE,
                                         collection_ids=[COLLECTION_US_NATIONAL])
        assert len(results) > 0
        assert results[0]['language'] == 'en'
        # ratios must be descending, each a 2-letter language code
        last_ratio = 1
        for lang in results:
            assert 'language' in lang
            assert len(lang['language']) == 2
            assert 'ratio' in lang
            assert lang['ratio'] < 1
            assert lang['ratio'] <= last_ratio
            last_ratio = lang['ratio']
            assert 'value' in lang
            assert lang['value'] > 0

    def test_story_list_paging(self):
        results1, next_page_token1 = self._search.story_list(query="weather",
                                                             start_date=self.START_DATE,
                                                             end_date=self.END_DATE,
                                                             collection_ids=[COLLECTION_US_NATIONAL])
        assert len(results1) == 1000
        assert next_page_token1 is not None
        # second page must be distinct and advance the token
        results2, next_page_token2 = self._search.story_list(query="weather",
                                                             start_date=self.START_DATE,
                                                             end_date=self.END_DATE,
                                                             collection_ids=[COLLECTION_US_NATIONAL],
                                                             pagination_token=next_page_token1)
        assert len(results2) == 1000
        assert next_page_token2 is not None
        assert next_page_token1 != next_page_token2

    def test_story_list_expanded(self):
        # NOTE(review): expanded stories require a staff API token
        page, _ = self._search.story_list(query="weather", start_date=self.START_DATE,
                                          end_date=self.END_DATE,
                                          collection_ids=[COLLECTION_US_NATIONAL])
        for story in page:
            assert 'text' not in story
        page, _ = self._search.story_list(query="weather", start_date=self.START_DATE,
                                          end_date=self.END_DATE, expanded=True,
                                          collection_ids=[COLLECTION_US_NATIONAL])
        for story in page:
            assert 'text' in story
            assert len(story['text']) > 0

    def test_story_list_sort_order(self):
        # default order: publish_date descending
        page, _ = self._search.story_list(query="weather", start_date=self.START_DATE,
                                          end_date=self.END_DATE,
                                          collection_ids=[COLLECTION_US_NATIONAL])
        last_pub_date = TOMORROW
        for story in page:
            assert 'publish_date' in story
            assert story['publish_date'] <= last_pub_date
            last_pub_date = story['publish_date']
        # explicit ascending order
        page, _ = self._search.story_list(query="weather", start_date=self.START_DATE,
                                          end_date=self.END_DATE,
                                          collection_ids=[COLLECTION_US_NATIONAL],
                                          sort_order='asc')
        a_long_time_ago = dt.date(2000, 1, 1)
        last_pub_date = a_long_time_ago
        for story in page:
            assert 'publish_date' in story
            assert story['publish_date'] >= last_pub_date
            last_pub_date = story['publish_date']

    def test_story_list_sort_field(self):
        # default sort field: publish_date
        page, _ = self._search.story_list(query="weather", start_date=self.START_DATE,
                                          end_date=self.END_DATE,
                                          collection_ids=[COLLECTION_US_NATIONAL])
        last_date = TOMORROW
        for story in page:
            assert 'publish_date' in story
            assert story['publish_date'] <= last_date
            last_date = story['publish_date']
        # explicit sort by indexed_date
        page, _ = self._search.story_list(query="weather", start_date=self.START_DATE,
                                          end_date=self.END_DATE,
                                          collection_ids=[COLLECTION_US_NATIONAL],
                                          sort_field="indexed_date")
        last_date = TOMORROW
        for story in page:
            assert 'indexed_date' in story
            assert story['indexed_date'] <= last_date
            last_date = story['indexed_date']

    def test_story_list_page_size(self):
        # a custom page size is honored exactly
        page, _ = self._search.story_list(query="weather", start_date=self.START_DATE,
                                          end_date=self.END_DATE,
                                          collection_ids=[COLLECTION_US_NATIONAL],
                                          page_size=103)
        assert len(page) == 103
:: Apache Software License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] +dependencies = [ + "requests == 2.*", +] + +[project.optional-dependencies] +dev = [ + "pre-commit", "flake8", "mypy", "isort", "types-urllib3", "types-requests", "python-dotenv" +] +test = [ + "pytest" +] + +[project.urls] +"Homepage" = "https://mediacloud.org" +"Bug Tracker" = "https://github.com/mediacloud/api-client/issues" diff --git a/setup.py b/setup.py deleted file mode 100644 index 8903c99..0000000 --- a/setup.py +++ /dev/null @@ -1,34 +0,0 @@ -#! /usr/bin/env python -import re -from os import path -from setuptools import setup - -REQUIRED_PACKAGES = [ - # utilities - "requests==2.*", # widely used HTTP library -] - -with open('mediacloud/__init__.py', 'r', encoding="utf-8") as fd: - version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1) - -# add README.md to distribution -this_directory = path.abspath(path.dirname(__file__)) -with open(path.join(this_directory, 'README.md'), encoding="utf-8") as f: - long_description = f.read() - -setup(name='mediacloud', - maintainer='Rahul Bhargava', - maintainer_email='r.bhargava@northeastern.edu', - version=version, - description='Media Cloud API Client Library', - long_description=long_description, - long_description_content_type='text/markdown', - url='http://mediacloud.org', - test_suite="mediacloud.test", - packages=['mediacloud'], - package_data={'': ['LICENSE']}, - python_requires='>3.7', - install_requires=REQUIRED_PACKAGES, - extras_require={'dev': ['pytest', 'pylint', 'twine', 'wheel', 'keyring', 'python-dotenv']}, - license='MIT', - )