Skip to content

Commit

Permalink
housekeeping
Browse files Browse the repository at this point in the history
  • Loading branch information
sbittrich committed Aug 23, 2024
1 parent 25a93df commit 3714784
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 71 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# Statistics
Shared code to compute archive-wide statistics like the number of non-hydrogen atoms.
Shared code to compute archive-wide statistics, like the number of non-hydrogen atoms.

# Statistics
| Task | Description | Count |
| --- | --- | --- |
| Task | Description | Count |
|--------|-------------------|---------------|
| Task01 | Count Heavy Atoms | 2,168,013,406 |

Last updated: 08/21/24
Last updated: 08/23/24
Number of structures: 224,004
12 changes: 6 additions & 6 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,18 @@
<dependency>
<groupId>org.rcsb</groupId>
<artifactId>ciftools-java</artifactId>
<version>5.0.2</version>
<version>6.0.0</version>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>2.0.7</version>
<version>2.0.16</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.4.7</version>
<version>1.5.7</version>
</dependency>
</dependencies>

Expand All @@ -41,7 +41,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.10.1</version>
<version>3.13.0</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
Expand All @@ -51,13 +51,13 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<!-- JUnit 5 requires Surefire version 2.22.0 or higher -->
<version>3.0.0-M7</version>
<version>3.4.0</version>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.1.0</version>
<version>3.7.1</version>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<descriptorRefs>
Expand Down
36 changes: 11 additions & 25 deletions src/main/java/org/rcsb/stats/Constants.java
Original file line number Diff line number Diff line change
@@ -1,30 +1,18 @@
package org.rcsb.stats;

/**
* Configuration and constants.
*/
public class Constants {
public static final String BCIF_SOURCE = "https://models.rcsb.org/%s.bcif.gz";
public static final String SEARCH_API_URL = "https://search.rcsb.org/rcsbsearch/v2/query?json=";
public static final String GET_ALL_EXPERIMENTAL_QUERY = """
{
"query": {
"type": "terminal",
"label": "text",
"service": "text",
"parameters": {
"attribute": "rcsb_entry_container_identifiers.entry_id",
"operator": "exists",
"negation": false
}
},
"return_type": "entry",
"request_options": {
"results_content_type": [
"experimental"
],
"return_all_hits": true,
"results_verbosity": "compact"
}
}""";
public static final String GET_ALL_CSM_QUERY = """

// distinguishes experimentally-determined structures and computed structure models
public enum ResultsContentType {
EXPERIMENTAL, COMPUTATIONAL;
}

public static final String GET_ALL_IDENTIFIERS_QUERY = """
{
"query": {
"type": "terminal",
Expand All @@ -38,9 +26,7 @@ public class Constants {
},
"return_type": "entry",
"request_options": {
"results_content_type": [
"computational"
],
"results_content_type": [%s],
"return_all_hits": true,
"results_verbosity": "compact"
}
Expand Down
73 changes: 46 additions & 27 deletions src/main/java/org/rcsb/stats/Helpers.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
import com.google.gson.JsonObject;
import org.rcsb.cif.CifIO;
import org.rcsb.cif.ParsingException;
import org.rcsb.cif.schema.StandardSchemata;
import org.rcsb.cif.schema.mm.MmCifFile;
import org.rcsb.cif.model.CifFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -29,6 +28,9 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
* Shared functionality.
*/
public class Helpers {
private static final Logger logger = LoggerFactory.getLogger(Helpers.class);

Expand All @@ -37,14 +39,14 @@ public class Helpers {
* @param identifiers set to operate on
* @return stream of structure data
*/
public static Stream<MmCifFile> fetchStructureData(Collection<String> identifiers) {
public static Stream<CifFile> fetchStructureData(Collection<String> identifiers) {
return identifiers.parallelStream()
.map(Helpers::fetchStructureData);
}

private static MmCifFile fetchStructureData(String identifier) {
private static CifFile fetchStructureData(String identifier) {
try {
return CifIO.readFromURL(new URL(String.format(Constants.BCIF_SOURCE, identifier))).as(StandardSchemata.MMCIF);
return CifIO.readFromURL(new URL(String.format(Constants.BCIF_SOURCE, identifier)));
} catch (IOException e) {
logger.warn("Failed to pull structure data for {}", identifier);
throw new UncheckedIOException(e);
Expand All @@ -56,13 +58,14 @@ private static MmCifFile fetchStructureData(String identifier) {

/**
* Get the set of all entry IDs of the requested content types known to the production system.
* @param experimental get experimental or computational identifiers
* @param contentTypes flavor of identifiers to request
* @return collection of known entry IDs
* @throws IOException operation failed
*/
public static Set<String> getAllIdentifiers(boolean experimental) throws IOException {
URL url = getSearchUrl(experimental);
public static Set<String> getAllIdentifiers(Set<Constants.ResultsContentType> contentTypes) throws IOException {
URL url = getSearchUrl(contentTypes);
logger.info("Retrieving current entry list from RCSB PDB Search API at {}", url.toString().split("\\?")[0]);

Set<String> out = new HashSet<>();
try (InputStream inputStream = url.openStream()) {
JsonElement jsonElement = new Gson().fromJson(new InputStreamReader(inputStream), JsonElement.class);
Expand All @@ -71,12 +74,18 @@ public static Set<String> getAllIdentifiers(boolean experimental) throws IOExcep
jsonObject.getAsJsonArray("result_set")
.forEach(id -> out.add(id.getAsString()));
}

logger.info("There are {} entries", Helpers.formatNumber(out.size()));
return out;
}

private static URL getSearchUrl(boolean experimental) throws MalformedURLException {
String query = URLEncoder.encode(experimental ? Constants.GET_ALL_EXPERIMENTAL_QUERY : Constants.GET_ALL_CSM_QUERY, StandardCharsets.UTF_8);
private static URL getSearchUrl(Set<Constants.ResultsContentType> contentTypes) throws MalformedURLException {
String ct = contentTypes.stream()
.map(Constants.ResultsContentType::name)
.map(String::toLowerCase)
.map(t -> "\"" + t + "\"")
.collect(Collectors.joining(", "));
String query = URLEncoder.encode(String.format(Constants.GET_ALL_IDENTIFIERS_QUERY, ct), StandardCharsets.UTF_8);
return new URL(Constants.SEARCH_API_URL + query);
}

Expand Down Expand Up @@ -107,31 +116,41 @@ public static String formatNumber(double d) {
return String.format("%,.2f", d);
}

/**
* Write the result of a task to the Markdown table in README.md.
* @param task name of updater
* @param result obtained count
* @param structureCount number of evaluated entries
* @throws IOException if README.md cannot be read or written
*/
public static void updateCount(Class<?> task, long result, int structureCount) throws IOException {
Path path = Paths.get("README.md");
logger.info("Updating file at {}", path);
String taskTag = task.getSimpleName().split("_")[0];
String taskDescription = task.getSimpleName().split("_")[1];
String out = Files.lines(path)
.map(line -> {
if (line.startsWith("| ")) {
if (line.startsWith("| " + taskTag)) {
return "| " + taskTag + " | " + insertWhitespaceBeforeUpperCase(taskDescription) + " | " + formatNumber(result) + " |";
} else {
return line;
}
} else if (line.startsWith("Last updated")) {
LocalDate currentDate = LocalDate.now();
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("MM/dd/yy");
return line.split(": ")[0] + ": " + currentDate.format(formatter);
} else if (line.startsWith("Number of structures")) {
return line.split(": ")[0] + ": " + formatNumber(structureCount);

try (Stream<String> lines = Files.lines(path)) {
String out = lines.map(line -> {
if (line.startsWith("| ")) {
if (line.startsWith("| " + taskTag)) {
return "| " + taskTag + " | " + insertWhitespaceBeforeUpperCase(taskDescription) + " | " + formatNumber(result) + " |";
} else {
return line;
}
})
.collect(Collectors.joining(System.lineSeparator()));
Files.writeString(path, out);
} else if (line.startsWith("Last updated")) {
LocalDate currentDate = LocalDate.now();
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("MM/dd/yy");
return line.split(": ")[0] + ": " + currentDate.format(formatter);
} else if (line.startsWith("Number of structures")) {
return line.split(": ")[0] + ": " + formatNumber(structureCount);
} else {
return line;
}
})
.collect(Collectors.joining(System.lineSeparator()));

Files.writeString(path, out);
}
}

private static String insertWhitespaceBeforeUpperCase(String input) {
Expand Down
50 changes: 41 additions & 9 deletions src/main/java/org/rcsb/stats/tasks/Task01_CountHeavyAtoms.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.rcsb.stats.tasks;

import org.rcsb.cif.schema.mm.MmCifFile;
import org.rcsb.cif.model.CifFile;
import org.rcsb.cif.schema.StandardSchemata;
import org.rcsb.stats.Constants;
import org.rcsb.stats.Helpers;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -9,6 +11,9 @@
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

/**
* Obtain the number of non-hydrogen atoms as described by the `atom_site` content of an mmCIF file.
*/
public class Task01_CountHeavyAtoms {
private static final Logger logger = LoggerFactory.getLogger(Task01_CountHeavyAtoms.class);

Expand All @@ -17,29 +22,56 @@ public static void main(String[] args) throws IOException {
}

void computeStats() throws IOException {
Set<String> identifiers = Helpers.getAllIdentifiers(true);

// request set of all known identifiers
Set<String> identifiers = Helpers.getAllIdentifiers(Set.of(Constants.ResultsContentType.EXPERIMENTAL));
AtomicInteger counter = new AtomicInteger();

// obtain stream of CifFiles
long heavyAtomCount = Helpers.fetchStructureData(identifiers)
// log progress every 10,000 elements
.peek(i -> { if (counter.incrementAndGet() % 10000 == 0) logger.info("Processed {} entries", Helpers.formatNumber(counter.get())); })
.mapToLong(Task01_CountHeavyAtoms::countHeavyAtoms)
// transform structure into number of atoms
.mapToLong(this::countHeavyAtoms)
// aggregate as sum
.sum();

logger.info("There are {} heavy (non-hydrogen) atoms in {} PDB structures", heavyAtomCount, Helpers.formatNumber(counter.get()));

// write results back to table in README.md
Helpers.updateCount(this.getClass(), heavyAtomCount, counter.get());
}

private static long countHeavyAtoms(MmCifFile cifFile) {
return cifFile.getFirstBlock()
/**
* Process a CIF file.
* @param cifFile source data
* @return the count of non-hydrogen atoms
*/
long countHeavyAtoms(CifFile cifFile) {
return cifFile
// optionally, apply mmCIF schema to get schema definitions and types
.as(StandardSchemata.MMCIF)
// CIF files may have multiple blocks of data, the PDB archive only makes use of the 1st
.getFirstBlock()
// access `atom_site` category
.getAtomSite()
// access `atom_site.type_symbol` column
.getTypeSymbol()
// stream over all element names of all atoms
.values()
.filter(Task01_CountHeavyAtoms::isHeavyAtom)
// retain only non-hydrogen atoms
.filter(this::isHeavyAtom)
// represent as count of all elements matching the condition
.count();
}

private static final Set<String> HYDROGEN_ATOMS = Set.of("H", "D", "T");
private static boolean isHeavyAtom(String typeSymbol) {
final Set<String> HYDROGEN_ATOMS = Set.of("H", "D", "T");

/**
* Filters for non-hydrogen atoms based on their `atom_site.type_symbol`.
* @param typeSymbol element of this atom
* @return false if this is hydrogen
*/
boolean isHeavyAtom(String typeSymbol) {
return !HYDROGEN_ATOMS.contains(typeSymbol);
}
}

0 comments on commit 3714784

Please sign in to comment.