diff --git a/docs/customization.md b/docs/customization.md index 1eba2ac6..d8d6767f 100644 --- a/docs/customization.md +++ b/docs/customization.md @@ -40,6 +40,8 @@ GO terms, on the other hand, are obtained from Ontobee: rdp.settings.cache.term-file=http://purl.obolibrary.org/obo/go.obo ``` +gene2go associations will only be populated for active taxa (new in 1.5.8). + ## Gene Tiers Users' genes are categorized in tiers based on their familiarity and experience with the gene. This is explained in diff --git a/src/main/java/ubc/pavlab/rdp/services/GOServiceImpl.java b/src/main/java/ubc/pavlab/rdp/services/GOServiceImpl.java index fc519621..a1c27dd1 100644 --- a/src/main/java/ubc/pavlab/rdp/services/GOServiceImpl.java +++ b/src/main/java/ubc/pavlab/rdp/services/GOServiceImpl.java @@ -49,6 +49,9 @@ public class GOServiceImpl implements GOService, InitializingBean { ANCESTORS_CACHE_NAME = "ubc.pavlab.rdp.services.GOService.ancestors", DESCENDANTS_CACHE_NAME = "ubc.pavlab.rdp.services.GOService.descendants"; + @Autowired + private TaxonService taxonService; + @Autowired private GeneOntologyTermInfoRepository goRepository; @@ -58,9 +61,6 @@ public class GOServiceImpl implements GOService, InitializingBean { @Autowired private OBOParser oboParser; - @Autowired - private Gene2GoParser gene2GoParser; - @Autowired private CacheManager cacheManager; @@ -157,6 +157,11 @@ public void updateGoTerms() { log.info( String.format( "Loading gene2go annotations from: %s.", cacheSettings.getAnnotationFile() ) ); + Set activeTaxa = taxonService.findByActiveTrue().stream() + .map( Taxon::getId ) + .collect( Collectors.toSet() ); + Gene2GoParser gene2GoParser = new Gene2GoParser( activeTaxa ); + Collection records; try { records = gene2GoParser.parse( new GZIPInputStream( cacheSettings.getAnnotationFile().getInputStream() ) ); diff --git a/src/main/java/ubc/pavlab/rdp/util/Gene2GoParser.java b/src/main/java/ubc/pavlab/rdp/util/Gene2GoParser.java index 824ad795..afc24da7 100644 --- a/src/main/java/ubc/pavlab/rdp/util/Gene2GoParser.java +++ b/src/main/java/ubc/pavlab/rdp/util/Gene2GoParser.java @@ -1,16 +1,17 @@ package ubc.pavlab.rdp.util; +import lombok.AccessLevel; import lombok.AllArgsConstructor; -import lombok.Data; +import lombok.Value; import lombok.extern.apachecommons.CommonsLog; import org.apache.commons.lang3.ArrayUtils; -import org.springframework.stereotype.Component; +import org.apache.commons.lang3.time.StopWatch; -import java.io.*; -import java.text.MessageFormat; -import java.util.Collection; -import java.util.Objects; -import java.util.stream.Collectors; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.LineNumberReader; +import java.util.*; /** * Read in the Gene2Go file provided by NCBI. @@ -18,35 +19,34 @@ * Created by mjacobson on 17/01/18. */ @CommonsLog -@Component public class Gene2GoParser { private static final String TAXON_ID_FIELD = "#tax_id", GENE_ID_FIELD = "GeneID", GO_ID_FIELD = "GO_ID"; private static final String[] EXPECTED_FIELDS = { TAXON_ID_FIELD, GENE_ID_FIELD, GO_ID_FIELD }; + private static final int + TAXON_ID_INDEX = ArrayUtils.indexOf( EXPECTED_FIELDS, TAXON_ID_FIELD ), + GENE_ID_INDEX = ArrayUtils.indexOf( EXPECTED_FIELDS, GENE_ID_FIELD ), + GO_ID_INDEX = ArrayUtils.indexOf( EXPECTED_FIELDS, GO_ID_FIELD ); - @Data - @AllArgsConstructor - public static class Record { - private Integer taxonId; - private Integer geneId; - private String goId; + private final Set retainedTaxa; - public static Record parseLine( String line, String[] headerFields, int lineNumber ) throws UncheckedParseException { - String[] values = line.split( "\t" ); - if ( values.length < headerFields.length ) { - throw new UncheckedParseException( MessageFormat.format( "Unexpected number of parts in: {0}", line ), lineNumber ); - } - try { - return new Record( Integer.valueOf( values[ArrayUtils.indexOf( headerFields, TAXON_ID_FIELD )] ), - Integer.valueOf( values[ArrayUtils.indexOf( headerFields, GENE_ID_FIELD )] ), - values[ArrayUtils.indexOf( headerFields, GO_ID_FIELD )] ); - } catch ( NumberFormatException e ) { - throw new UncheckedParseException( MessageFormat.format( "Could not parse number for: {0}.", line ), lineNumber, e ); - } - } + /** + * @param retainedTaxa a set of taxa to retain from the gene2go input, or null to ignore + */ + public Gene2GoParser( Set retainedTaxa ) { + this.retainedTaxa = retainedTaxa; + } + + @Value + @AllArgsConstructor(access = AccessLevel.PRIVATE) + public static class Record { + int taxonId; + int geneId; + String goId; } public Collection parse( InputStream input ) throws ParseException, IOException { + StopWatch timer = StopWatch.createStarted(); try ( LineNumberReader br = new LineNumberReader( new InputStreamReader( input ) ) ) { String headerLine = br.readLine(); @@ -58,19 +58,48 @@ public Collection parse( InputStream input ) throws ParseException, IOEx for ( String field : EXPECTED_FIELDS ) { if ( !ArrayUtils.contains( headerFields, field ) ) { - throw new ParseException( MessageFormat.format( "Unexpected header line: {0}.", headerLine ), br.getLineNumber() ); + throw new ParseException( String.format( "Unexpected header line: %s", headerLine ), br.getLineNumber() ); } } - try { - return br.lines() - .map( line -> Record.parseLine( line, headerFields, br.getLineNumber() ) ) - .collect( Collectors.toList() ); - } catch ( UncheckedIOException ioe ) { - throw ioe.getCause(); - } catch ( UncheckedParseException e ) { - throw e.getCause(); + String line; + Set seenTaxa = new HashSet<>(); + List records = new ArrayList<>(); + while ( ( line = br.readLine() ) != null ) { + Record r; + int lineNumber = br.getLineNumber(); + int taxonId, geneId; + String goId; + String[] values = line.split( "\t" ); + if ( values.length < headerFields.length ) { + throw new ParseException( String.format( "Unexpected number of parts in: %s", line ), lineNumber ); + } + try { + taxonId = Integer.parseInt( values[TAXON_ID_INDEX] ); + seenTaxa.add( taxonId ); + if ( retainedTaxa != null && !retainedTaxa.contains( taxonId ) ) { + // we've seen all the taxa that we needed to, terminate + if ( seenTaxa.containsAll( retainedTaxa ) ) { + log.debug( "All taxa we needed were parsed, terminating early!" ); + break; + } + continue; + } else { + geneId = Integer.parseInt( values[GENE_ID_INDEX] ); + goId = values[GO_ID_INDEX]; + r = new Record( taxonId, geneId, goId ); + } + } catch ( NumberFormatException e ) { + throw new ParseException( String.format( "Could not parse number for: %s", line ), lineNumber, e ); + } finally { + if ( ( lineNumber + 1 ) % 1000000 == 0 ) { + log.debug( String.format( "Parsed %d line from (%d line/s)", + lineNumber + 1, (int) ( 1000.0 * ( lineNumber + 1 ) / timer.getTime() ) ) ); + } + } + records.add( r ); } + return records; } } } diff --git a/src/test/java/ubc/pavlab/rdp/services/GOServiceImplTest.java b/src/test/java/ubc/pavlab/rdp/services/GOServiceImplTest.java index 4479bf36..833d9969 100644 --- a/src/test/java/ubc/pavlab/rdp/services/GOServiceImplTest.java +++ b/src/test/java/ubc/pavlab/rdp/services/GOServiceImplTest.java @@ -7,6 +7,7 @@ import org.mockito.internal.util.collections.Sets; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.TestConfiguration; +import org.springframework.boot.test.mock.mockito.MockBean; import org.springframework.cache.CacheManager; import org.springframework.cache.concurrent.ConcurrentMapCacheManager; import org.springframework.context.annotation.Bean; @@ -28,6 +29,9 @@ import static java.util.function.Function.identity; import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; +import static org.mockito.internal.verification.VerificationModeFactory.times; import static ubc.pavlab.rdp.util.TestUtils.*; /** @@ -58,7 +62,7 @@ public GOService goService() { @Bean public Gene2GoParser gene2GoParser() { - return new Gene2GoParser(); + return new Gene2GoParser( null ); } @Bean @@ -80,6 +84,9 @@ public CacheManager cacheManager() { @Autowired private GOService goService; + @MockBean + public TaxonService taxonService; + private Taxon taxon; private Map genes; private Map terms; @@ -95,6 +102,7 @@ public void setUp() { // T2[G2] T3[G1] T5[G1] taxon = createTaxon( 1 ); + when( taxonService.findByActiveTrue() ).thenReturn( Collections.singleton( taxon ) ); genes = new HashMap<>(); @@ -456,5 +464,6 @@ public void getTerm_whenNullId_thenReturnNull() { public void updateGoTerms() { goService.updateGoTerms(); assertThat( goService.getTerm( "GO:0000001" ) ).isNotNull(); + verify( taxonService, times( 2 ) ).findByActiveTrue(); } } \ No newline at end of file diff --git a/src/test/java/ubc/pavlab/rdp/util/Gene2GoParserIntegrationTest.java b/src/test/java/ubc/pavlab/rdp/util/Gene2GoParserIntegrationTest.java new file mode 100644 index 00000000..ff8b6900 --- /dev/null +++ b/src/test/java/ubc/pavlab/rdp/util/Gene2GoParserIntegrationTest.java @@ -0,0 +1,37 @@ +package ubc.pavlab.rdp.util; + +import org.junit.Test; +import org.springframework.core.io.UrlResource; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.zip.GZIPInputStream; + +import static org.junit.Assume.assumeNoException; + +public class Gene2GoParserIntegrationTest { + + /** + * This test can be lengthy! + */ + @Test + public void parse_withOnlineFile_thenSucceeds() throws ParseException { + Gene2GoParser parser = new Gene2GoParser( Collections.singleton( 9606 ) ); + try ( InputStream is = new GZIPInputStream( new UrlResource( "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz" ).getInputStream() ) ) { + parser.parse( is ); + } catch ( IOException e ) { + assumeNoException( e ); + } + } + + @Test + public void parse_withOnlineFile_whenFileIsEmpty_thenSkipTheWholeFile() throws ParseException { + Gene2GoParser parser = new Gene2GoParser( Collections.emptySet() ); + try ( InputStream is = new GZIPInputStream( new UrlResource( "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz" ).getInputStream() ) ) { + parser.parse( is ); + } catch ( IOException e ) { + assumeNoException( e ); + } + } +} \ No newline at end of file