Skip to content

Commit

Permalink
Merge branch 'develop' of https://github.com/parklab/refinery-platform
Browse files Browse the repository at this point in the history
…into develop
  • Loading branch information
jkmarx committed Aug 11, 2015
2 parents 2af149c + 13c5390 commit 2c57ec6
Show file tree
Hide file tree
Showing 15 changed files with 1,102 additions and 1,214 deletions.
137 changes: 60 additions & 77 deletions refinery/data_set_manager/management/commands/mage2isa_convert.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
import errno
import logging
import os
import re
import string
import sys
import urllib2

from django.conf import settings
from django.core.management import call_command
from django.core.management.base import BaseCommand

from celery.task.sets import TaskSet, subtask

from data_set_manager.models import Study
from data_set_manager.tasks import convert_to_isatab
from datetime import date, datetime, timedelta
from django.conf import settings
from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandError
import os
import sys
import urllib2
import errno
import string
import re
import time
import os.path
import logging
import tempfile

# get module logger
logger = logging.getLogger(__name__)


class Command(BaseCommand):
help = "Fetches a list of ArrayExpress experiments and converts their"
help = "%s MAGE-TAB to \nISA-Tab based on the keywords entered.\n" % help
Expand All @@ -39,136 +39,119 @@ class Command(BaseCommand):
help = "%s species='homo sapiens AND mouse'\n\n" % help

def _create_dir(self, file_path):
"""
Name: create_dir
Description:
creates a directory if it needs to be created
"""creates a directory if it needs to be created
Parameters:
file_path: directory to create if necessary
file_path: directory to create if necessary
"""
try:
os.makedirs(file_path)
except OSError, e:
if e.errno != errno.EEXIST:
raise


def _make_query(self, args):
    """Create an ArrayExpress query string from the command line arguments.

    Parameters:
    args: the command line arguments (strings); they are joined with "&"
    and appended to settings.AE_BASE_QUERY

    Returns:
    the full query string; when no arguments are given the base query is
    completed with an empty "exptype=" filter instead
    """
    if args:
        return "%s%s" % (settings.AE_BASE_QUERY, "&".join(args))
    return "%sexptype=" % settings.AE_BASE_QUERY


def _last_run_date(self, ae_file):
    """Return the date of the last pull from ArrayExpress.

    Parameters:
    ae_file: path of the studies file; its modification time marks the
    last run

    Returns:
    a datetime.date; today's date when the file cannot be stat'ed (e.g.
    it does not exist yet)
    """
    try:
        return datetime.fromtimestamp(os.path.getmtime(ae_file)).date()
    except OSError:
        # if file doesn't exist yet, then just make last_date_run today
        return date.today()


def _download_studies(self, ae_query, ae_file):
    """Download the ArrayExpress query result into ae_file.

    Parameters:
    ae_query: URL to fetch
    ae_file: path of the file the response is written to (overwritten)
    """
    logger.info("getting %s", ae_query)
    response = urllib2.urlopen(ae_query)
    try:
        logger.info("writing to file %s", ae_file)
        with open(ae_file, 'w') as out:
            # download in pieces to make sure you're never biting off too
            # much
            block_sz = 8192
            while True:
                chunk = response.read(block_sz)
                if not chunk:
                    break
                out.write(chunk)
    finally:
        # release the connection even if reading or writing failed
        response.close()


def _collect_accessions(self, ae_file, last_date_run):
    """Pick the accessions in ae_file that need to be converted.

    Parameters:
    ae_file: path of the downloaded ArrayExpress studies file
    last_date_run: datetime.date of the previous pull

    Returns:
    list of accession strings that are either unknown to the database or
    were updated on/after last_date_run
    """
    ae_accessions = list()
    with open(ae_file, 'r') as studies:
        for line in studies:
            # get date that study was updated; between "lastupdatedate"
            # tags
            try:
                updated = line.split('lastupdatedate>')[1]
            except IndexError:
                # looking at line without interesting information
                continue
            # take off the </ connected to the date
            updated = updated[:-2]
            # many accessions, so search for right one
            for a in line.split('accession>'):
                if not a.startswith('E-'):
                    continue
                # take off the </ connected to the accession
                a = a[:-2]
                if not Study.objects.filter(identifier=a):
                    # study not in the database yet: always convert
                    ae_accessions.append(a)
                else:
                    # known study: convert again only if it has been
                    # updated since the last time we did this
                    update = datetime.strptime(updated, '%Y-%m-%d').date()
                    if (update - last_date_run) > timedelta(days=-1):
                        ae_accessions.append(a)
    return ae_accessions


def handle(self, *args, **options):
    """Main program; calls the parsing and insertion functions.

    Queries ArrayExpress with the given arguments, converts every new or
    recently updated study to ISA-Tab through convert_to_isatab subtasks
    and finally runs the process_arrayexpress_isatab command on the
    results.

    Parameters:
    args: command line arguments used to build the ArrayExpress query
    options: command options (not read here)
    """
    logger.info("Logging from mage2isa_convert")
    ae_query = self._make_query(args)

    self._create_dir(settings.CONVERSION_DIR)

    # find out when the last pull from ArrayExpress was; this must happen
    # before the download below rewrites the file and its timestamp
    ae_file = os.path.join(settings.CONVERSION_DIR, 'arrayexpress_studies')
    last_date_run = self._last_run_date(ae_file)

    self._download_studies(ae_query, ae_file)
    ae_accessions = self._collect_accessions(ae_file, last_date_run)

    # create directories that zip archives will reside in
    base_isa_dir = os.path.join(settings.ISA_TAB_DIR, 'isa')
    base_preisa_dir = os.path.join(settings.ISA_TAB_DIR, 'pre_isa')
    self._create_dir(base_isa_dir)
    self._create_dir(base_preisa_dir)

    # create subtasks for converting now that you know what to convert
    s_tasks = [
        convert_to_isatab.subtask(
            args=(ae_accession, "%s/%s" % (base_isa_dir, ae_accession),
                  base_preisa_dir))
        for ae_accession in ae_accessions
    ]

    # dispatch the tasks and wait for everything to return
    job = TaskSet(tasks=s_tasks)
    result = job.apply_async()
    for i in result.iterate():
        logger.info(i)
        sys.stdout.flush()

    # space-saving measure: drop the downloaded data but leave an empty
    # file behind, since its modification time is read as the last run
    # date on the next invocation
    os.remove(ae_file)
    with open(ae_file, 'w'):
        pass

    call_command('process_arrayexpress_isatab', base_isa_dir,
                 "base_pre_isa_dir=%s" % base_preisa_dir, "is_public=True")
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ def handle(self, *args, **options):
except User.DoesNotExist:
raise CommandError("User '%s' does not exist" % args[0])
public_group = ExtendedGroup.objects.public_group()
#TODO: optimize retrieving user's data sets
# TODO: optimize retrieving user's data sets
for data_set in DataSet.objects.all():
if user == data_set.get_owner():
self.stdout.write("Making public data set '%s'" % data_set.name)
self.stdout.write(
"Making public data set '%s'" % data_set.name)
data_set.share(public_group)
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import process_isatab


class Command(process_isatab.Command):
help = "Takes the directory of an ISA-Tab file as input, parses, and"
help = "%s inputs it into the database\n" % help
Expand All @@ -8,13 +9,13 @@ class Command(process_isatab.Command):
help = "%s<base_pre_isatab_directory> is_public=True]\n" % help

def __init__(self, filename=None):
    """Set up the command with the ArrayExpress-specific defaults.

    Parameters:
    filename: accepted for call compatibility; not used by this
    initializer
    """
    super(Command, self).__init__()
    # user name that handle() prepends to the argument list
    self._username = "ArrayExpress"
    self._additional_raw_data_file_extension = ".gz"

def handle(self, *args, **options):
    """Run the parent command's handle() with the username prepended.

    Parameters:
    args: positional command arguments; passed on after self._username
    options: command options, passed through unchanged
    """
    # insert username into argument list
    super(Command, self).handle(self._username, *args, **options)
Loading

0 comments on commit 2c57ec6

Please sign in to comment.