From cfa7e16b1335cb879fe480698f34ebd60d212d5c Mon Sep 17 00:00:00 2001 From: Chris Wage Date: Mon, 10 Aug 2015 17:45:35 -0500 Subject: [PATCH 1/3] patched to fix issue #25 https://github.com/ecprice/newsdiffs/issues/25 --- website/manage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/manage.py b/website/manage.py index 6fd85def..333e602f 100755 --- a/website/manage.py +++ b/website/manage.py @@ -11,7 +11,7 @@ if __name__ == "__main__": os.environ.setdefault("DJANGO_SETTINGS_MODULE", "website.settings") - sys.path.append(os.path.dirname(os.getcwd())) + sys.path.append(os.getcwd()) from django.core.management import execute_from_command_line execute_from_command_line(sys.argv) From 60bdb87b3c29d7f50c9a5b8489203c962062fbd3 Mon Sep 17 00:00:00 2001 From: Jon Williams Date: Wed, 16 Mar 2016 12:15:06 -0400 Subject: [PATCH 2/3] remove South from apps --- website/settings_dev.py | 1 - website/settings_main.py | 1 - 2 files changed, 2 deletions(-) diff --git a/website/settings_dev.py b/website/settings_dev.py index ea2d7d98..a62e8ab5 100644 --- a/website/settings_dev.py +++ b/website/settings_dev.py @@ -83,6 +83,5 @@ 'django.contrib.contenttypes', 'django.contrib.sessions', 'django.contrib.sites', - 'south', 'frontend', ) diff --git a/website/settings_main.py b/website/settings_main.py index a436069e..6bb048da 100644 --- a/website/settings_main.py +++ b/website/settings_main.py @@ -102,7 +102,6 @@ 'django.contrib.contenttypes', 'django.contrib.sessions', 'django.contrib.sites', - 'south', 'frontend', ) From 6c604b8a9c42b041aecf2710f0333ac23b388e65 Mon Sep 17 00:00:00 2001 From: Jon Williams Date: Wed, 16 Mar 2016 12:21:26 -0400 Subject: [PATCH 3/3] Cleanup commands in README; specify Django 1.8 dep --- README.md | 54 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index a63d82e0..68c40cb8 100644 --- a/README.md +++ b/README.md @@ -18,29 +18,33 @@ 
Requirements You need to have installed on your local machine * Git * Python 2.6 or later -* Django and other Python libraries +* Django (~1.8) and other Python libraries On a Debian- or Ubuntu-based system, it may suffice (untested) to run - $ sudo apt-get install git-core python-django python-django-south python-simplejson +``` +$ sudo apt-get install git-core python-django python-simplejson +``` On Mac OS, the easiest way may be to install pip: http://www.pip-installer.org/en/latest/installing.html and then - $ pip install Django - +``` +$ pip install Django==1.8 +``` Initial setup ------------- - - $ python website/manage.py syncdb && python website/manage.py migrate - $ mkdir articles - +``` +$ python website/manage.py syncdb && python website/manage.py migrate && mkdir articles +``` Running NewsDiffs Locally ------------------------- Do the initial setup above. Then to start the webserver for testing: - $ python website/manage.py runserver +``` +$ python website/manage.py runserver +``` and visit http://localhost:8000/ @@ -51,19 +55,23 @@ Running the scraper Do the initial setup above. You will also need additional Python libraries; on a Debian- or Ubuntu-based system, it may suffice (untested) to run - $ sudo apt-get install python-bs4 python-beautifulsoup +``` +$ sudo apt-get install python-bs4 python-beautifulsoup +``` on a Mac, you will want something like - - $ pip install beautifulsoup4 - $ pip install beautifulsoup - $ pip install html5lib - +``` +$ pip install beautifulsoup4 +$ pip install beautifulsoup +$ pip install html5lib +``` Note that we need two versions of BeautifulSoup, both 3.2 and 4.0; some websites are parsed correctly in only one version. Then run - $ python website/manage.py scraper +``` +$ python website/manage.py scraper +``` This will populate the articles repository with a list of current news articles. 
This is a snapshot at a single time, so the website will @@ -77,9 +85,9 @@ overwritten each run) and errors to /tmp/newsdiffs/logging_errs (which is cumulative). To run the scraper every hour, run something like: - - $ while true; do python website/manage.py scraper; sleep 60m; done - +``` +$ while true; do python website/manage.py scraper; sleep 60m; done +``` or make a cron job. Adding new sites to the scraper @@ -92,13 +100,13 @@ parsers/__init__.py . You need to subclass of BaseParser (in parsers/baseparser.py). Model it off the other parsers in that directory. You can test the parser with by running, e.g., +``` $ python parsers/test_parser.py bbc.BBCParser - +``` which will output a list of URLs to track, and - +``` $ python parsers/test_parser.py bbc.BBCParser http://www.bbc.co.uk/news/uk-21649494 - +``` which will output the text that NewsDiffs would store. (2) Add the parser to 'parsers' in parsers/__init__.py