Skip to content

Commit

Permalink
Improve the Gene2Go parser memory efficiency
Browse files Browse the repository at this point in the history
Finish early if all the taxa we needed to parse were seen already.

Do not produce records for inactive taxa.
  • Loading branch information
arteymix committed Nov 17, 2023
1 parent 9d5fcc7 commit e28bdd0
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 40 deletions.
2 changes: 2 additions & 0 deletions docs/customization.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ GO terms, on the other hand, are obtained from Ontobee:
rdp.settings.cache.term-file=http://purl.obolibrary.org/obo/go.obo
```

gene2go associations will only be populated for active taxa (new in 1.5.8).

## Gene Tiers

Users' genes are categorized in tiers based on their familiarity and experience with the gene. This is explained in
Expand Down
11 changes: 8 additions & 3 deletions src/main/java/ubc/pavlab/rdp/services/GOServiceImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ public class GOServiceImpl implements GOService, InitializingBean {
ANCESTORS_CACHE_NAME = "ubc.pavlab.rdp.services.GOService.ancestors",
DESCENDANTS_CACHE_NAME = "ubc.pavlab.rdp.services.GOService.descendants";

@Autowired
private TaxonService taxonService;

@Autowired
private GeneOntologyTermInfoRepository goRepository;

Expand All @@ -58,9 +61,6 @@ public class GOServiceImpl implements GOService, InitializingBean {
@Autowired
private OBOParser oboParser;

@Autowired
private Gene2GoParser gene2GoParser;

@Autowired
private CacheManager cacheManager;

Expand Down Expand Up @@ -157,6 +157,11 @@ public void updateGoTerms() {

log.info( String.format( "Loading gene2go annotations from: %s.", cacheSettings.getAnnotationFile() ) );

Set<Integer> activeTaxa = taxonService.findByActiveTrue().stream()
.map( Taxon::getId )
.collect( Collectors.toSet() );
Gene2GoParser gene2GoParser = new Gene2GoParser( activeTaxa );

Collection<Gene2GoParser.Record> records;
try {
records = gene2GoParser.parse( new GZIPInputStream( cacheSettings.getAnnotationFile().getInputStream() ) );
Expand Down
101 changes: 65 additions & 36 deletions src/main/java/ubc/pavlab/rdp/util/Gene2GoParser.java
Original file line number Diff line number Diff line change
@@ -1,52 +1,52 @@
package ubc.pavlab.rdp.util;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.Value;
import lombok.extern.apachecommons.CommonsLog;
import org.apache.commons.lang3.ArrayUtils;
import org.springframework.stereotype.Component;
import org.apache.commons.lang3.time.StopWatch;

import java.io.*;
import java.text.MessageFormat;
import java.util.Collection;
import java.util.Objects;
import java.util.stream.Collectors;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.*;

/**
* Read in the Gene2Go file provided by NCBI.
* <p>
* Created by mjacobson on 17/01/18.
*/
@CommonsLog
@Component
public class Gene2GoParser {

private static final String TAXON_ID_FIELD = "#tax_id", GENE_ID_FIELD = "GeneID", GO_ID_FIELD = "GO_ID";
private static final String[] EXPECTED_FIELDS = { TAXON_ID_FIELD, GENE_ID_FIELD, GO_ID_FIELD };
private static final int
TAXON_ID_INDEX = ArrayUtils.indexOf( EXPECTED_FIELDS, TAXON_ID_FIELD ),
GENE_ID_INDEX = ArrayUtils.indexOf( EXPECTED_FIELDS, GENE_ID_FIELD ),
GO_ID_INDEX = ArrayUtils.indexOf( EXPECTED_FIELDS, GO_ID_FIELD );

@Data
@AllArgsConstructor
public static class Record {
private Integer taxonId;
private Integer geneId;
private String goId;
private final Set<Integer> retainedTaxa;

public static Record parseLine( String line, String[] headerFields, int lineNumber ) throws UncheckedParseException {
String[] values = line.split( "\t" );
if ( values.length < headerFields.length ) {
throw new UncheckedParseException( MessageFormat.format( "Unexpected number of parts in: {0}", line ), lineNumber );
}
try {
return new Record( Integer.valueOf( values[ArrayUtils.indexOf( headerFields, TAXON_ID_FIELD )] ),
Integer.valueOf( values[ArrayUtils.indexOf( headerFields, GENE_ID_FIELD )] ),
values[ArrayUtils.indexOf( headerFields, GO_ID_FIELD )] );
} catch ( NumberFormatException e ) {
throw new UncheckedParseException( MessageFormat.format( "Could not parse number for: {0}.", line ), lineNumber, e );
}
}
/**
 * Create a parser that only produces records for the given taxa.
 *
 * @param retainedTaxa identifiers of the taxa whose records are kept. {@code null} disables the
 *                     filtering, so every record is kept. An empty set keeps nothing and lets
 *                     parsing stop at the first data line. The set is stored as-is, not copied.
 */
public Gene2GoParser( Set<Integer> retainedTaxa ) {
this.retainedTaxa = retainedTaxa;
}

/**
 * A single gene-to-GO-term association read from one data line of the gene2go input.
 * <p>
 * The all-arguments constructor is private, so instances are only created by the enclosing parser.
 */
@Value
@AllArgsConstructor(access = AccessLevel.PRIVATE)
public static class Record {
/** Identifier of the taxon the gene belongs to. */
int taxonId;
/** Identifier of the annotated gene. */
int geneId;
/** Identifier of the GO term, kept as the raw string found in the input. */
String goId;
}

public Collection<Record> parse( InputStream input ) throws ParseException, IOException {
StopWatch timer = StopWatch.createStarted();
try ( LineNumberReader br = new LineNumberReader( new InputStreamReader( input ) ) ) {
String headerLine = br.readLine();

Expand All @@ -58,19 +58,48 @@ public Collection<Record> parse( InputStream input ) throws ParseException, IOEx

for ( String field : EXPECTED_FIELDS ) {
if ( !ArrayUtils.contains( headerFields, field ) ) {
throw new ParseException( MessageFormat.format( "Unexpected header line: {0}.", headerLine ), br.getLineNumber() );
throw new ParseException( String.format( "Unexpected header line: %s", headerLine ), br.getLineNumber() );
}
}

try {
return br.lines()
.map( line -> Record.parseLine( line, headerFields, br.getLineNumber() ) )
.collect( Collectors.toList() );
} catch ( UncheckedIOException ioe ) {
throw ioe.getCause();
} catch ( UncheckedParseException e ) {
throw e.getCause();
// Resolve the column positions from the actual header instead of assuming the layout of
// EXPECTED_FIELDS: the header check above only verifies that each expected field is present,
// not that the fields come first or in that order.
final int taxonIdColumn = ArrayUtils.indexOf( headerFields, TAXON_ID_FIELD );
final int geneIdColumn = ArrayUtils.indexOf( headerFields, GENE_ID_FIELD );
final int goIdColumn = ArrayUtils.indexOf( headerFields, GO_ID_FIELD );

String line;
// every taxon encountered so far, retained or not; used to decide when to stop early
Set<Integer> seenTaxa = new HashSet<>();
List<Record> records = new ArrayList<>();
while ( ( line = br.readLine() ) != null ) {
    Record r;
    int lineNumber = br.getLineNumber();
    String[] values = line.split( "\t" );
    // a line with at least as many parts as the header is guaranteed to cover the resolved columns
    if ( values.length < headerFields.length ) {
        throw new ParseException( String.format( "Unexpected number of parts in: %s", line ), lineNumber );
    }
    try {
        int taxonId = Integer.parseInt( values[taxonIdColumn] );
        seenTaxa.add( taxonId );
        if ( retainedTaxa != null && !retainedTaxa.contains( taxonId ) ) {
            // we've seen all the taxa that we needed to, terminate
            // NOTE(review): this relies on all the lines of a given taxon being contiguous in the
            // input; presumably true for the NCBI file - confirm.
            if ( seenTaxa.containsAll( retainedTaxa ) ) {
                log.debug( "All taxa we needed were parsed, terminating early!" );
                break;
            }
            // skipped before the gene ID is parsed, so no record is built for a discarded taxon
            continue;
        }
        int geneId = Integer.parseInt( values[geneIdColumn] );
        r = new Record( taxonId, geneId, values[goIdColumn] );
    } catch ( NumberFormatException e ) {
        throw new ParseException( String.format( "Could not parse number for: %s", line ), lineNumber, e );
    } finally {
        // in a finally block so that progress is also reported for skipped lines
        if ( ( lineNumber + 1 ) % 1000000 == 0 ) {
            log.debug( String.format( "Parsed %d line from (%d line/s)",
                    lineNumber + 1, (int) ( 1000.0 * ( lineNumber + 1 ) / timer.getTime() ) ) );
        }
    }
    records.add( r );
}
return records;
}
}
}
11 changes: 10 additions & 1 deletion src/test/java/ubc/pavlab/rdp/services/GOServiceImplTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.mockito.internal.util.collections.Sets;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.TestConfiguration;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.cache.CacheManager;
import org.springframework.cache.concurrent.ConcurrentMapCacheManager;
import org.springframework.context.annotation.Bean;
Expand All @@ -28,6 +29,9 @@

import static java.util.function.Function.identity;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import static org.mockito.internal.verification.VerificationModeFactory.times;
import static ubc.pavlab.rdp.util.TestUtils.*;

/**
Expand Down Expand Up @@ -58,7 +62,7 @@ public GOService goService() {

@Bean
public Gene2GoParser gene2GoParser() {
return new Gene2GoParser();
return new Gene2GoParser( null );
}

@Bean
Expand All @@ -80,6 +84,9 @@ public CacheManager cacheManager() {
@Autowired
private GOService goService;

@MockBean
public TaxonService taxonService;

private Taxon taxon;
private Map<Integer, GeneInfo> genes;
private Map<Integer, GeneOntologyTermInfo> terms;
Expand All @@ -95,6 +102,7 @@ public void setUp() {
// T2[G2] T3[G1] T5[G1]

taxon = createTaxon( 1 );
when( taxonService.findByActiveTrue() ).thenReturn( Collections.singleton( taxon ) );

genes = new HashMap<>();

Expand Down Expand Up @@ -456,5 +464,6 @@ public void getTerm_whenNullId_thenReturnNull() {
public void updateGoTerms() {
goService.updateGoTerms();
// the update must leave the root test term resolvable
assertThat( goService.getTerm( "GO:0000001" ) ).isNotNull();
// NOTE(review): two lookups of the active taxa are expected although updateGoTerms() is
// called once here; presumably the other one happens while the service is set up - confirm.
verify( taxonService, times( 2 ) ).findByActiveTrue();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package ubc.pavlab.rdp.util;

import org.junit.Test;
import org.springframework.core.io.UrlResource;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.Collections;
import java.util.zip.GZIPInputStream;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assume.assumeNoException;

public class Gene2GoParserIntegrationTest {

/**
* This test can be lengthy!
*/
@Test
public void parse_withOnlineFile_thenSucceeds() throws ParseException {
Gene2GoParser parser = new Gene2GoParser( Collections.singleton( 9606 ) );
try ( InputStream is = new GZIPInputStream( new UrlResource( "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz" ).getInputStream() ) ) {
parser.parse( is );
} catch ( IOException e ) {
assumeNoException( e );
}
}

@Test
public void parse_withOnlineFile_whenFileIsEmpty_thenSkipTheWholeFile() throws ParseException {
Gene2GoParser parser = new Gene2GoParser( Collections.emptySet() );
try ( InputStream is = new GZIPInputStream( new UrlResource( "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz" ).getInputStream() ) ) {
parser.parse( is );
} catch ( IOException e ) {
assumeNoException( e );
}
}
}

0 comments on commit e28bdd0

Please sign in to comment.