diff --git a/layer2/CifFile.cpp b/layer2/CifFile.cpp index 3e3a0234d..b53785b95 100644 --- a/layer2/CifFile.cpp +++ b/layer2/CifFile.cpp @@ -12,14 +12,20 @@ #include #include +#include #include #include +#include #include "CifFile.h" #include "File.h" #include "MemoryDebug.h" #include "strcasecmp.h" +#if !defined(_PYMOL_NO_MSGPACKC) +#include +#endif + namespace pymol { namespace _cif_detail { @@ -125,11 +131,18 @@ const char * cif_loop::get_value_raw(int row, int col) const { // get the number of elements in this array unsigned cif_array::size() const { - return (col == NOT_IN_LOOP) ? 1 : pointer.loop->nrows; + if (auto arr = std::get_if(&m_array)) { + return (arr->col == cif_detail::cif_str_array::NOT_IN_LOOP) + ? 1 + : arr->pointer.loop->nrows; + } else if (auto arr = std::get_if(&m_array)) { + return arr->m_arr.size(); + } + return 0; } /// Get array value, return nullptr if `pos >= size()` or value in ['.', '?'] -const char* cif_array::get_value_raw(unsigned pos) const +const char* cif_detail::cif_str_array::get_value_raw(unsigned pos) const { if (col == NOT_IN_LOOP) return (pos > 0) ? nullptr : pointer.value; @@ -157,41 +170,82 @@ bool cif_array::is_missing_all() const { * @param key data name, must be lower case */ const cif_array * cif_data::get_arr(const char * key) const { - const char* p = strchr(key, '?'); - decltype(m_dict)::const_iterator it; + if (auto data = std::get_if(&m_data)) { + const auto& dict = data->m_dict; + const char* p = strchr(key, '?'); + std::remove_reference_t::const_iterator it; #ifndef NDEBUG - for (const char* q = key; *q; ++q) { - assert("key must be lower case" && !('Z' >= *q && *q >= 'A')); - } + for (const char* q = key; *q; ++q) { + assert("key must be lower case" && !('Z' >= *q && *q >= 'A')); + } #endif - // support alias shortcut: '?' matches '.' and '_' - if (p != nullptr) { - std::string tmp(key); - // replace '?' by '.' or '_' - tmp[p - key] = '.'; - if ((it = m_dict.find(tmp.c_str())) != m_dict.end()) - return &it->second; - tmp[p - key] = '_'; - if ((it = m_dict.find(tmp.c_str())) != m_dict.end()) - return &it->second; - } else { - if ((it = m_dict.find(key)) != m_dict.end()) - return &it->second; + // support alias shortcut: '?' matches '.' and '_' + if (p != nullptr) { + std::string tmp(key); + // replace '?' by '.' or '_' + tmp[p - key] = '.'; + if ((it = dict.find(tmp.c_str())) != dict.end()) + return &it->second; + tmp[p - key] = '_'; + if ((it = dict.find(tmp.c_str())) != dict.end()) + return &it->second; + } else { + if ((it = dict.find(key)) != dict.end()) + return &it->second; + } + } else if (auto data = std::get_if(&m_data)) { + + const auto& dict = data->m_dict; + + std::string_view keyView(key); + auto split_key = [](const char c) { + return c == '.' /*|| c == '_'*/ || c == '?'; + }; + auto splitTokenIt = std::find_if(keyView.begin(), keyView.end(), split_key); + if (splitTokenIt == keyView.end()) { + return nullptr; + } + auto dist = std::distance(keyView.begin(), splitTokenIt); + auto categoryView = keyView.substr(0, dist); + auto categoryStr = std::string(categoryView); + auto categoryIt = dict.find(categoryStr.c_str()); + if (categoryIt == dict.end()) { + return nullptr; + } + auto& category = categoryIt->second; + auto columnView = keyView.substr(dist + 1); + auto columnStr = std::string(columnView); + auto columnIt = category.find(columnStr.c_str()); + if (columnIt == category.end()) { + return nullptr; + } + return &columnIt->second; } return nullptr; } +const char* cif_data::code() const +{ + if (auto data = std::get_if(&m_data)) { + return data->m_code ? data->m_code : ""; + } + return ""; +} + const cif_array* cif_data::empty_array() { return &EMPTY_ARRAY; } -const cif_data* cif_data::get_saveframe(const char* code) const { - auto it = m_saveframes.find(code); - if (it != m_saveframes.end()) - return &it->second; +const cif_detail::cif_str_data* cif_data::get_saveframe(const char* code) const { + if (auto data = std::get_if(&m_data)) { + const auto& saveframes = data->m_saveframes; + auto it = saveframes.find(code); + if (it != saveframes.end()) + return &it->second; + } return nullptr; } @@ -302,8 +356,8 @@ bool cif_file::parse(char*&& p) { } } - cif_data* current_frame = nullptr; - std::vector frame_stack; + cif_detail::cif_str_data* current_frame = nullptr; + std::vector frame_stack; std::unique_ptr global_block; decltype(m_datablocks) datablocksnew; @@ -324,7 +378,10 @@ bool cif_file::parse(char*&& p) { } tolowerinplace(tokens[i]); - current_frame->m_dict[tokens[i]].set_value(tokens[i + 1]); + current_frame->m_dict[tokens[i]].m_array = cif_detail::cif_str_array{}; + auto& cif_arr = std::get( + current_frame->m_dict[tokens[i]].m_array); + cif_arr.set_value(tokens[i + 1]); i++; } else if (strcasecmp("loop_", tokens[i]) == 0) { @@ -344,8 +401,10 @@ bool cif_file::parse(char*&& p) { // columns while (++i < n && keypossible[i] && tokens[i][0] == '_') { tolowerinplace(tokens[i]); - - current_frame->m_dict[tokens[i]].set_loop(loop, ncols); + current_frame->m_dict[tokens[i]].m_array = cif_detail::cif_str_array{}; + auto& cif_arr = std::get( + current_frame->m_dict[tokens[i]].m_array); + cif_arr.set_loop(loop, ncols); ncols++; } @@ -376,15 +435,18 @@ bool cif_file::parse(char*&& p) { i--; } else if (strncasecmp("data_", tokens[i], 5) == 0) { - datablocksnew.emplace_back(); - current_frame = &datablocksnew.back(); + auto& new_data = datablocksnew[tokens[i] + 5]; + new_data.m_data = cif_detail::cif_str_data(); + current_frame = &std::get(new_data.m_data); current_frame->m_code = tokens[i] + 5; frame_stack = {current_frame}; } else if (strncasecmp("global_", tokens[i], 5) == 0) { // STAR feature, not supported in CIF - current_frame = new cif_data; - global_block.reset(current_frame); + auto new_data = new cif_data; + new_data->m_data = cif_detail::cif_str_data{}; + current_frame = &std::get(new_data->m_data); + global_block.reset(new_data); frame_stack = {current_frame}; } else if (strncasecmp("save_", tokens[i], 5) == 0) { @@ -419,6 +481,308 @@ bool cif_file::parse(char*&& p) { return true; } + +#if !defined(_PYMOL_NO_MSGPACKC) +enum class DataTypes +{ + Int8 = 1, + Int16 = 2, + Int32 = 3, + UInt8 = 4, + UInt16 = 5, + UInt32 = 6, + Float32 = 32, + Float64 = 33, +}; + +template +void decodeAndPushBack(const std::vector& bytes, std::size_t& i, + std::size_t size, std::vector& result) +{ + T value; + std::memcpy(&value, &bytes[i], size); + result.push_back(value); +} + +static std::vector byte_array_decode(const std::vector& bytes, DataTypes dataType) +{ + std::vector result; + std::unordered_map dataTypeSize = { + {DataTypes::Int8, sizeof(std::int8_t)}, + {DataTypes::Int16, sizeof(std::int16_t)}, + {DataTypes::Int32, sizeof(std::int32_t)}, + {DataTypes::UInt8, sizeof(std::uint8_t)}, + {DataTypes::UInt16, sizeof(std::uint16_t)}, + {DataTypes::UInt32, sizeof(std::uint32_t)}, + {DataTypes::Float32, sizeof(float)}, + {DataTypes::Float64, sizeof(double)}, + }; + + auto size = dataTypeSize[dataType]; + for (std::size_t i = 0; i < bytes.size(); i += size) { + CifArrayElement valueVar; + switch (dataType) { + case DataTypes::Int8: + decodeAndPushBack(bytes, i, size, result); + break; + case DataTypes::Int16: + decodeAndPushBack(bytes, i, size, result); + break; + case DataTypes::Int32: + decodeAndPushBack(bytes, i, size, result); + break; + case DataTypes::UInt8: + decodeAndPushBack(bytes, i, size, result); + break; + case DataTypes::UInt16: + decodeAndPushBack(bytes, i, size, result); + break; + case DataTypes::UInt32: + decodeAndPushBack(bytes, i, size, result); + break; + case DataTypes::Float32: + decodeAndPushBack(bytes, i, size, result); + break; + case DataTypes::Float64: + decodeAndPushBack(bytes, i, size, result); + break; + } + } + return result; +} + +static std::vector integer_packing_decode( + const std::vector& packedInts, int byteCount, int srcSize, + bool isUnsigned) +{ + std::vector result(srcSize); + std::int32_t upperLimit; + if (isUnsigned) { + upperLimit = byteCount == 1 ? std::numeric_limits::max() + : std::numeric_limits::max(); + } else { + upperLimit = byteCount == 1 ? std::numeric_limits::max() + : std::numeric_limits::max(); + } + std::int32_t lowerLimit = -upperLimit - 1; + + auto as_int = [isUnsigned, byteCount](auto&& elem) -> std::int32_t { + if (isUnsigned) { + return byteCount == 1 ? static_cast(std::get(elem)) + : static_cast(std::get(elem)); + } else { + return byteCount == 1 ? static_cast(std::get(elem)) + : static_cast(std::get(elem)); + } + }; + + auto at_limit = [isUnsigned, upperLimit, lowerLimit](std::int32_t t) -> bool { + return isUnsigned ? (t == upperLimit) + : (t == upperLimit || t == lowerLimit); + }; + + for (int i = 0, j = 0; i < packedInts.size(); ++i, ++j) { + std::int32_t value = 0; + std::int32_t t = as_int(packedInts[i]); + while (at_limit(t)) { + value += t; + t = as_int(packedInts[++i]); + } + value += t; + result[j] = value; + } + return result; +} + +static std::vector delta_decode( + std::vector& data, std::int32_t origin, DataTypes srcType) +{ + std::vector result = data; + result[0] = origin; + auto add_int32_t = [](auto&& a, auto&& b) -> std::int32_t { + return std::get(a) + std::get(b); + }; + std::inclusive_scan(result.begin(), result.end(), result.begin(), add_int32_t); + return result; +} + +static std::vector run_length_decode( + std::vector& data, DataTypes srcType, int srcSize) +{ + std::vector result; + for (std::size_t i = 0; i < data.size(); i += 2) { + auto item = std::get(data[i]); + auto count = std::get(data[i + 1]); + for (std::int32_t j = 0; j < count; j++) { + result.push_back(item); + } + } + return result; +} + +static std::vector fixed_array_decode( + std::vector& data, int factor, DataTypes srcType) +{ + std::vector result = data; + auto div_int32_t = [factor, srcType](auto&& a) -> auto { + return srcType == DataTypes::Float32 + ? std::get(a) / static_cast(factor) + : std::get(a) / static_cast(factor); + }; + std::transform(data.begin(), data.end(), result.begin(), div_int32_t); + return result; +} + +static std::vector interval_quant_decode( + std::vector& data, double min, double max, int numSteps, + DataTypes srcType) +{ + std::vector result = data; + auto delta = (max - min) / (numSteps - 1); + std::transform(data.begin(), data.end(), result.begin(), + [min, delta](auto&& a) -> double { + return min + std::get(a) * delta; + }); + return result; +} + +static std::vector parse_bcif_decode( + const std::vector& rawData, + std::vector>& dataEncoding); + +static std::vector string_array_decode( + const std::vector& data, + std::vector>& indicesEncoding, + const std::string& stringData, const std::vector& offsets, + std::vector>& offsetEncoding) +{ + auto decodedOffsets = parse_bcif_decode(offsets, offsetEncoding); + auto indices = parse_bcif_decode(data, indicesEncoding); + + std::vector result; + result.reserve(indices.size()); + + std::vector strings = {""}; + strings.reserve(decodedOffsets.size()); + for (int i = 1; i < decodedOffsets.size(); i++) { + auto start = std::get(decodedOffsets[i - 1]); + auto end = std::get(decodedOffsets[i]); + auto str = stringData.substr(start, end - start); + strings.push_back(str); + } + + for (int i = 0; i < indices.size(); i++) { + auto index = std::get(indices[i]); + result.push_back(strings[index + 1]); + } + return result; +} + +static void parse_bcif_decode_kind(const std::string& kind, + const std::vector& rawData, + std::vector& result, + std::map& dataEncoding) +{ + if (kind == "ByteArray") { + auto type = dataEncoding["type"].as(); + result = byte_array_decode(rawData, static_cast(type)); + } else if (kind == "FixedPoint") { + auto factor = dataEncoding["factor"].as(); + auto srcType = dataEncoding["srcType"].as(); + result = fixed_array_decode(result, factor, static_cast(srcType)); + } else if (kind == "IntervalQuantization") { + auto min = dataEncoding["min"].as(); + auto max = dataEncoding["max"].as(); + auto numSteps = dataEncoding["numSteps"].as(); + auto srcType = dataEncoding["srcType"].as(); + result = interval_quant_decode(result, min, max, numSteps, static_cast(srcType)); + } else if (kind == "RunLength") { + auto srcType = dataEncoding["srcType"].as(); + auto srcSize = dataEncoding["srcSize"].as(); + result = run_length_decode(result, static_cast(srcType), srcSize); + } else if (kind == "Delta") { + auto origin = dataEncoding["origin"].as(); + auto srcType = dataEncoding["srcType"].as(); + result = delta_decode(result, origin, static_cast(srcType)); + } else if (kind == "IntegerPacking") { + auto byteCount = dataEncoding["byteCount"].as(); + auto srcSize = dataEncoding["srcSize"].as(); + auto isUnsigned = dataEncoding["isUnsigned"].as(); + result = integer_packing_decode(result, byteCount, srcSize, isUnsigned); + } else if (kind == "StringArray") { + auto indicesEncoding = dataEncoding["dataEncoding"].as>>(); + auto stringData = dataEncoding["stringData"].as(); + auto offsets = dataEncoding["offsets"].as>(); + auto offsetEncoding = dataEncoding["offsetEncoding"].as>>(); + result = string_array_decode(rawData, indicesEncoding, stringData, offsets, offsetEncoding); + } +} + +static std::vector parse_bcif_decode(const std::vector& rawData, + std::vector>& dataEncoding) +{ + std::vector result; + for (auto it = std::rbegin(dataEncoding); it != std::rend(dataEncoding); ++it) { + auto& dataEncode = *it; + parse_bcif_decode_kind( + dataEncode["kind"].as(), rawData, result, dataEncode); + } + return result; +} + + +bool cif_file::parse_bcif(const char* bytes, std::size_t size) +{ + m_datablocks.clear(); + m_tokens.clear(); + + auto oh = msgpack::unpack(bytes, size); + auto msgobj = oh.get(); + auto dict = msgobj.as>(); + + auto dataBlocksRaw = dict["dataBlocks"].as>(); + pymol::cif_detail::bcif_data* currentFrame{}; + auto& dataDict = m_datablocks; + for (const auto& block : dataBlocksRaw) { + auto blockMap = block.as>(); + auto header = blockMap["header"].as(); + auto categoriesRaw = blockMap["categories"].as>(); + auto& new_block = m_datablocks[header]; + new_block.m_data = pymol::cif_detail::bcif_data{}; + currentFrame = &std::get(new_block.m_data); + pymol::cif_data& categories = dataDict[header]; + categories.m_data = pymol::cif_detail::bcif_data{}; + auto& categoriesData = std::get(categories.m_data); + for (const auto& category : categoriesRaw) { + auto categoryMap = category.as>(); + auto categoryName = categoryMap["name"].as(); + std::transform(categoryName.begin(), categoryName.end(), + categoryName.begin(), ::tolower); + auto columnsRaw = categoryMap["columns"].as>(); + auto& columns = categoriesData.m_dict[categoryName]; + for (const auto& column : columnsRaw) { + auto columnMap = column.as>(); + auto columnName = columnMap["name"].as(); + std::transform(columnName.begin(), columnName.end(), + columnName.begin(), ::tolower); + auto dataRaw = columnMap["data"].as>(); + auto dataData = dataRaw["data"].as>(); + auto dataEncoding = dataRaw["encoding"].as>>(); + auto vec = parse_bcif_decode(dataData, dataEncoding); + columns[columnName] = std::move(vec); + } + } + dataDict[header] = std::move(categories); + } + return true; +} +#else +bool cif_file::parse_bcif(const char* bytes, std::size_t size) +{ + return false; +} +#endif // !defined(_PYMOL_NO_MSGPACKC) + } // namespace pymol // vi:sw=2:ts=2 diff --git a/layer2/CifFile.h b/layer2/CifFile.h index 3d18784a2..c6414acd8 100644 --- a/layer2/CifFile.h +++ b/layer2/CifFile.h @@ -12,10 +12,17 @@ #include #include #include +#include +#include // for pymol::default_free #include "MemoryDebug.h" +template +struct overloaded : Ts... { using Ts::operator()...; }; +template +overloaded(Ts...) -> overloaded; + namespace pymol { namespace _cif_detail { @@ -44,6 +51,11 @@ template T raw_to_typed(const char*); class cif_data; class cif_loop; class cif_array; +namespace cif_detail { + struct cif_str_data; + struct bcif_data; +}; +using CIFData = std::variant; /** * Class for reading CIF files. @@ -57,7 +69,7 @@ class cif_array; * * Iterate over data blocks: * @verbatim - for (auto& block : cf.datablocks()) { + for (auto& [code, block] : cf.datablocks()) { // data_ const char* code = block->code(); @@ -81,7 +93,7 @@ class cif_array; */ class cif_file { std::vector m_tokens; - std::vector m_datablocks; + std::map m_datablocks; std::unique_ptr m_contents; /** @@ -98,6 +110,14 @@ class cif_file { /// Parse CIF string bool parse_string(const char*); + /** + * Parse BinaryCIF blob + * @param bytes BinaryCIF blob + * @param size Blob size + * @post datablocks() is valid + */ + bool parse_bcif(const char* bytes, std::size_t size); + protected: /// Report a parsing error virtual void error(const char*); @@ -114,54 +134,112 @@ class cif_file { cif_file(const char* filename, const char* contents = nullptr); /// Data blocks - const std::vector& datablocks() const { return m_datablocks; } + const std::map& datablocks() const { return m_datablocks; } }; -/** - * View on a CIF data array. The viewed data is owned by the cif_file - */ -class cif_array { - friend class cif_file; -private: - enum { NOT_IN_LOOP = -1 }; +using CifArrayElement = std::variant; - // column index, -1 if not in loop - short col; +namespace cif_detail { + struct cif_str_array { + enum { NOT_IN_LOOP = -1 }; - // pointer to either loop or single value - union { - const cif_loop * loop; - const char * value; - } pointer; + // column index, -1 if not in loop + short col; - // Raw data value or nullptr for unknown/inapplicable and `pos >= size()` - const char* get_value_raw(unsigned pos = 0) const; + // pointer to either loop or single value + union { + const cif_loop * loop; + const char * value; + } pointer; - // point this array to a loop (only for parsing) - void set_loop(const cif_loop * loop, short col_) { - col = col_; - pointer.loop = loop; - }; + // Raw data value or NULL for unknown/inapplicable and `pos >= size()` + const char* get_value_raw(unsigned pos = 0) const; + + // point this array to a loop (only for parsing) + void set_loop(const cif_loop * loop, short col_) { + col = col_; + pointer.loop = loop; + }; - // point this array to a single value (only for parsing) - void set_value(const char * value) { - col = NOT_IN_LOOP; - pointer.value = value; + // point this array to a single value (only for parsing) + void set_value(const char * value) { + col = NOT_IN_LOOP; + pointer.value = value; + }; }; + struct bcif_array { + std::vector m_arr{}; + }; + + /** + * Returns a typed value from a CIF data element. + * If the element is missing or inapplicable, return `d`. + * @param var CIF data element + * @param d default value + * @return typed value + */ + template T var_to_typed(const CifArrayElement& var, const T& d) + { + if constexpr (std::is_same_v) { + auto& str = std::get(var); + return !str.empty() ? str.c_str() : d; + } else { + if (auto ptr = std::get_if(&var); ptr && ptr->empty()) { + return d; + } + if constexpr (!std::is_same_v) { + return std::visit(overloaded{[](const std::string& s) -> T { + return _cif_detail::raw_to_typed( + s.c_str()); + }, + [](const auto& v) -> T { return v; }}, + var); + } + } + return d; + } +} + +/** + * View on a CIF data array. The viewed data is owned by the cif_file + */ +class cif_array { + friend class cif_file; + +private: + mutable std::string m_internal_str_cache; + std::variant m_array; public: // constructor cif_array() = default; // constructor (only needed for EMPTY_ARRAY) - cif_array(std::nullptr_t) { set_value(nullptr); } + cif_array(std::nullptr_t) { + if (auto arr = std::get_if(&m_array)) { + arr->set_value(nullptr); + } else if (auto arr = std::get_if(&m_array)) { + arr->m_arr.clear(); + } + } + + cif_array(std::vector&& arr) { + m_array = cif_detail::bcif_array{std::move(arr)}; + } /// Number of elements in this array (= number of rows in loop) unsigned size() const; /// True if value in ['.', '?'] - bool is_missing(unsigned pos = 0) const { return !get_value_raw(pos); } + bool is_missing(unsigned pos = 0) const { + if (auto arr = std::get_if(&m_array)) { + return !arr->get_value_raw(pos); + } else { + return false; + } + } /// True if all values in ['.', '?'] bool is_missing_all() const; @@ -172,8 +250,16 @@ class cif_array { * @param d default value for unknown/inapplicable elements */ template T as(unsigned pos = 0, T d = T()) const { - const char* s = get_value_raw(pos); - return s ? _cif_detail::raw_to_typed(s) : d; + if (auto arr = std::get_if(&m_array)) { + const char* s = arr->get_value_raw(pos); + return s ? _cif_detail::raw_to_typed(s) : d; + } else if (auto arr = std::get_if(&m_array)) { + if (pos >= arr->m_arr.size()) + return d; + auto& var = arr->m_arr[pos]; + return cif_detail::var_to_typed(var, d); + } + return d; } /** @@ -184,7 +270,25 @@ class cif_array { * @param d default value for unknown/inapplicable elements */ const char* as_s(unsigned pos = 0, const char* d = "") const { - return as(pos, d); + if (std::get_if(&m_array)) { + return as(pos, d); + } else if (auto arr = std::get_if(&m_array)) { + if (pos >= arr->m_arr.size()) + return d; + if (auto str_ptr = std::get_if(&arr->m_arr[pos])) { + return str_ptr->c_str(); + } + m_internal_str_cache = std::visit([](auto&& arg) -> std::string { + if constexpr (std::is_same_v, + std::string>) { + return arg; + } else { + return std::to_string(arg); + } + }, arr->m_arr[pos]); + return m_internal_str_cache.c_str(); + } + return d; } /// Alias for as() @@ -210,17 +314,33 @@ class cif_array { /** * CIF data block. The viewed data is owned by the cif_file. */ -class cif_data { - friend class cif_file; - // data_ - const char* m_code = nullptr; +namespace cif_detail { + struct cif_str_data { + // data_ + const char* m_code = nullptr; + + std::map<_cif_detail::zstring_view, cif_array> m_dict; + std::map m_dict_str; + std::map<_cif_detail::zstring_view, cif_detail::cif_str_data> m_saveframes; + + // only needed for freeing + std::vector> m_loops; + }; + + using ColumnMap = std::map>; + using CategoryMap = std::map; + using DataBlockMap = std::map; + struct bcif_data { + std::string m_code; + std::map> m_dict; + }; +} - std::map<_cif_detail::zstring_view, cif_array> m_dict; - std::map<_cif_detail::zstring_view, cif_data> m_saveframes; +class cif_data { + friend class cif_file; - // only needed for freeing - std::vector> m_loops; + CIFData m_data; // generic default value static const cif_array* empty_array(); @@ -234,7 +354,7 @@ class cif_data { cif_data& operator=(cif_data&&) = default; /// Block code (never nullptr) - const char* code() const { return m_code ? m_code : ""; } + const char* code() const; // Get a pointer to array or nullptr if not found const cif_array* get_arr(const char* key) const; @@ -253,7 +373,7 @@ class cif_data { } /// Get a pointer to a save frame or nullptr if not found - const cif_data* get_saveframe(const char* code) const; + const cif_detail::cif_str_data* get_saveframe(const char* code) const; }; } // namespace pymol diff --git a/layer2/CifMoleculeReader.cpp b/layer2/CifMoleculeReader.cpp index 29a303b06..5622ea2da 100644 --- a/layer2/CifMoleculeReader.cpp +++ b/layer2/CifMoleculeReader.cpp @@ -435,7 +435,7 @@ static bond_dict_t * get_global_components_bond_dict(PyMOLGlobals * G) { return nullptr; } - for (const auto& datablock : cif.datablocks()) { + for (const auto& [code, datablock] : cif.datablocks()) { read_chem_comp_bond_dict(&datablock, bond_dict); } } @@ -2264,7 +2264,7 @@ pymol::Result ObjectMoleculeReadCifStr(PyMOLGlobals * G, Object return pymol::make_error("Parsing CIF file failed: ", cif->m_error_msg); } - for (const auto& datablock : cif->datablocks()) { + for (const auto& [code, datablock] : cif->datablocks()) { ObjectMolecule * obj = ObjectMoleculeReadCifData(G, &datablock, discrete, quiet); if (!obj) { @@ -2330,7 +2330,7 @@ const bond_dict_t::mapped_type * bond_dict_t::get(PyMOLGlobals * G, const char * return nullptr; } - for (auto& item : cif.datablocks()) + for (auto& [code, item] : cif.datablocks()) read_chem_comp_bond_dict(&item, *this); } } @@ -2352,4 +2352,54 @@ const bond_dict_t::mapped_type * bond_dict_t::get(PyMOLGlobals * G, const char * return nullptr; } + +/////////////////////////////////////// + +pymol::Result ObjectMoleculeReadBCif(PyMOLGlobals* G, + ObjectMolecule* I, const char* bytes, std::size_t size, int frame, + int discrete, int quiet, int multiplex, int zoom) +{ +#ifdef _PYMOL_NO_MSGPACKC + PRINTFB(G, FB_ObjectMolecule, FB_Errors) + " Error: This build has no BinaryCIF support.\n" + " Please install/enable msgpack-c.\n" + ENDFB(G); + return nullptr; +#endif + + if (I) { + return pymol::Error("loading BCIF into existing object not supported, " + "please use 'create' to append to an existing object."); + } + + if (multiplex > 0) { + return pymol::Error("loading BCIF with multiplex=1 not supported, please " + "use 'split_states' after loading the object."); + } + + auto cif = std::make_shared(); + cif->parse_bcif(bytes, size); + + for (const auto& [code, datablock] : cif->datablocks()) { + auto obj = ObjectMoleculeReadCifData(G, &datablock, discrete, quiet); + if (!obj) { + PRINTFB(G, FB_ObjectMolecule, FB_Warnings) + " BCIF-Warning: no coordinates found in data_%s\n", datablock.code() ENDFB(G); + continue; + } + +#ifndef _PYMOL_NOPY + // we only provide access from the Python API so far + if (SettingGet(G, cSetting_cif_keepinmemory)) { + obj->m_cifdata = &datablock; + obj->m_ciffile = cif; + } +#endif + + if (cif->datablocks().size() == 1 || multiplex == 0) + return obj; + } + return nullptr; +} + // vi:sw=2:ts=2:expandtab diff --git a/layer2/ObjectMolecule.h b/layer2/ObjectMolecule.h index f79ff4292..d158e7d9b 100644 --- a/layer2/ObjectMolecule.h +++ b/layer2/ObjectMolecule.h @@ -518,6 +518,9 @@ ObjectMolecule *ObjectMoleculeReadMmtfStr(PyMOLGlobals * G, ObjectMolecule * I, const char *st, int st_len, int frame, int discrete, int quiet, int multiplex, int zoom); pymol::Result ObjectMoleculeReadCifStr(PyMOLGlobals * G, ObjectMolecule * I, const char *st, int frame, int discrete, int quiet, int multiplex, int zoom); +pymol::Result ObjectMoleculeReadBCif(PyMOLGlobals* G, + ObjectMolecule* I, const char* bytes, std::size_t size, int frame, + int discrete, int quiet, int multiplex, int zoom); std::unique_ptr LoadTrajSeleHelper( const ObjectMolecule* obj, CoordSet* cs, const char* selection); diff --git a/layer3/Executive.cpp b/layer3/Executive.cpp index 29290dc9a..cea8aa1cc 100644 --- a/layer3/Executive.cpp +++ b/layer3/Executive.cpp @@ -3693,6 +3693,7 @@ ExecutiveLoadPrepareArgs(PyMOLGlobals * G, case cLoadTypeSDF2Str: case cLoadTypeXYZStr: case cLoadTypeDXStr: + case cLoadTypeBCIFStr: if (!content) { return pymol::Error("content is nullptr"); } @@ -3715,6 +3716,7 @@ ExecutiveLoadPrepareArgs(PyMOLGlobals * G, case cLoadTypeSDF2: case cLoadTypeXYZ: case cLoadTypeDXMap: + case cLoadTypeBCIF: if (content) { fname_null_ok = true; break; @@ -3883,6 +3885,13 @@ pymol::Result<> ExecutiveLoad(PyMOLGlobals* G, ExecutiveLoadArgs const& args) p_return_if_error(res); obj = res.result(); } break; + case cLoadTypeBCIF: + case cLoadTypeBCIFStr: { + auto res = ObjectMoleculeReadBCif(G, static_cast(origObj), + content, size, state, discrete, quiet, multiplex, zoom); + p_return_if_error(res); + obj = res.result(); + } break; case cLoadTypeMMTF: case cLoadTypeMMTFStr: obj = ObjectMoleculeReadMmtfStr(G, (ObjectMolecule *) origObj, diff --git a/layer3/Executive.h b/layer3/Executive.h index 69a8cef3c..c40a552dd 100644 --- a/layer3/Executive.h +++ b/layer3/Executive.h @@ -127,6 +127,8 @@ enum cLoadType_t : int { cLoadTypeCCP4UnspecifiedStr = 76, cLoadTypeMRCStr = 77, + cLoadTypeBCIF = 78, + cLoadTypeBCIFStr = 79, }; /* NOTE: if you add new content/object type above, then be sure to add diff --git a/layerCTest/Test_CifFile.cpp b/layerCTest/Test_CifFile.cpp index 67a05c8b0..77108d287 100644 --- a/layerCTest/Test_CifFile.cpp +++ b/layerCTest/Test_CifFile.cpp @@ -45,17 +45,17 @@ TEST_CASE("misc", "[CifFile]") REQUIRE(cf1.datablocks().size() == 3); REQUIRE(cf2.datablocks().size() == 3); REQUIRE(cf3.datablocks().size() == 3); - REQUIRE(cf1.datablocks()[2].get_opt("_undotted_key")->as_s() == std::string("why not")); - REQUIRE(cf2.datablocks()[2].get_opt("_undotted_key")->as_s() == std::string("why not")); - REQUIRE(cf3.datablocks()[2].get_opt("_undotted_key")->as_s() == std::string("why not")); + REQUIRE(cf1.datablocks().find("baz")->second.get_opt("_undotted_key")->as_s() == std::string("why not")); + REQUIRE(cf2.datablocks().find("baz")->second.get_opt("_undotted_key")->as_s() == std::string("why not")); + REQUIRE(cf3.datablocks().find("baz")->second.get_opt("_undotted_key")->as_s() == std::string("why not")); auto& blocks = cf1.datablocks(); - REQUIRE(blocks[0].code() == std::string("FOO")); - REQUIRE(blocks[1].code() == std::string("bar")); - REQUIRE(blocks[2].code() == std::string("baz")); + REQUIRE(blocks.find("FOO")->second.code() == std::string("FOO")); + REQUIRE(blocks.find("bar")->second.code() == std::string("bar")); + REQUIRE(blocks.find("baz")->second.code() == std::string("baz")); - auto* data = &blocks.front(); + auto* data = &blocks.find("FOO")->second; REQUIRE(data->get_arr("_cat1.key3") != nullptr); REQUIRE(data->get_arr("_cat1.key3") == data->get_opt("_cat1.key3")); @@ -137,15 +137,15 @@ TEST_CASE("misc", "[CifFile]") REQUIRE(data->get_arr("_cat2_key1") == nullptr); REQUIRE(data->get_opt("_cat2?key1")->as_i(0, 99) == 10); - REQUIRE(blocks[2].get_arr("_undotted.key") == nullptr); - REQUIRE(blocks[2].get_opt("_undotted?key")->as_s() == std::string("why not")); + REQUIRE(blocks.find("baz")->second.get_arr("_undotted.key") == nullptr); + REQUIRE(blocks.find("baz")->second.get_opt("_undotted?key")->as_s() == std::string("why not")); // float parsing - REQUIRE(blocks[2].get_opt("_typed_float1")->as() == Approx(1230.f)); - REQUIRE(blocks[2].get_opt("_typed_float1")->as() == Approx(1230.00000)); - REQUIRE(blocks[2].get_opt("_typed_float2")->as() == Approx(12.3400000)); - REQUIRE(blocks[2].get_opt("_typed_float3")->as() == Approx(1.23456789)); + REQUIRE(blocks.find("baz")->second.get_opt("_typed_float1")->as() == Approx(1230.f)); + REQUIRE(blocks.find("baz")->second.get_opt("_typed_float1")->as() == Approx(1230.00000)); + REQUIRE(blocks.find("baz")->second.get_opt("_typed_float2")->as() == Approx(12.3400000)); + REQUIRE(blocks.find("baz")->second.get_opt("_typed_float3")->as() == Approx(1.23456789)); } // vi:sw=2:expandtab diff --git a/modules/pymol/constants.py b/modules/pymol/constants.py index e17cb49ad..dc19cdebc 100644 --- a/modules/pymol/constants.py +++ b/modules/pymol/constants.py @@ -59,6 +59,8 @@ class _loadable: dxstr = 75 # DX file (APBS) mapstr = 76 # unspecified CCP4 or MRC map mrcstr = 77 + bcif = 78 + bcifstr = 79 class loadable(_loadable): @classmethod @@ -82,7 +84,8 @@ def _reverse_lookup(cls, number): loadable.map : loadable.mapstr, loadable.dx : loadable.dxstr, loadable.xyz : loadable.xyzstr, - loadable.sdf2 : loadable.sdf2str} + loadable.sdf2 : loadable.sdf2str, + loadable.bcif : loadable.bcifstr} sanitize_alpha_list_re = re.compile(r"[^a-zA-Z0-9_\'\"\.\-\[\]\,]+") nt_hidden_path_re = re.compile(r"\$[\/\\]") diff --git a/modules/pymol/importing.py b/modules/pymol/importing.py index 318ad6696..da323aa02 100644 --- a/modules/pymol/importing.py +++ b/modules/pymol/importing.py @@ -89,6 +89,11 @@ def filename_to_format(filename): format = 'pdbml' elif ext in ('mmcif',): format = 'cif' + elif ext in ('bcif',): + format = 'bcif' + elif ext in ('bcifgz',): + format = 'bcif' + zipped = 'gz' elif re.match(r'pdb\d+$', ext): format = 'pdb' elif re.match(r'xyz_\d+$', ext): @@ -1134,6 +1139,9 @@ def finish_object(name, *, _self=cmd): "/data/structures/divided/mmCIF/{mid}/{code}.cif.gz", "https://files-versioned.wwpdb.org/pdb_versioned/views/latest/coordinates/mmcif/{mid}/pdb_{code:0>8}/pdb_{code:0>8}_xyz.cif.gz", ], + "bcif" : [ + "https://models.rcsb.org/{code}.{type}.gz", + ], "2fofc" : "https://www.ebi.ac.uk/pdbe/coordinates/files/{code}.ccp4", "fofc" : "https://www.ebi.ac.uk/pdbe/coordinates/files/{code}_diff.ccp4", "pubchem": [ @@ -1182,6 +1190,8 @@ def _fetch(code, name, state, finish, discrete, multiplex, zoom, type, path, nameFmt = '{type}_{code}.sdf' elif type == 'cif': pass + elif type == 'bcif': + pass elif type == 'mmtf': pass elif type == 'cc': @@ -1300,7 +1310,7 @@ def _multifetch(code,name,state,finish,discrete,multiplex,zoom,type,path,file,qu obj_name = 'emd_' + obj_code chain = None - if (len(obj_code) > 4 and type in ('pdb', 'cif', 'mmtf') and + if (len(obj_code) > 4 and type in ('pdb', 'cif', 'mmtf', 'bcif') and # "Extended PDB accession codes" have 8 characters, # try to distinguish by leading non-zero digit '1' <= obj_code[0] <= '9'): @@ -1344,8 +1354,8 @@ def fetch(code, name='', state=0, finish=1, discrete=-1, state = the state number into which the file should loaded. - type = str: cif, pdb, pdb1, 2fofc, fofc, emd, cid, sid {default: cif - (default was "pdb" up to 1.7.6)} + type = str: cif, bcif, pdb, pdb1, 2fofc, fofc, emd, cid, sid + {default: cif (default was "pdb" up to 1.7.6)} async_ = 0/1: download in the background and do not block the PyMOL command line {default: 0 -- changed in PyMOL 2.3} diff --git a/testing/data/115d.bcif.gz b/testing/data/115d.bcif.gz new file mode 100644 index 000000000..08a37fcde Binary files /dev/null and b/testing/data/115d.bcif.gz differ diff --git a/testing/tests/api/test_importing.py b/testing/tests/api/test_importing.py new file mode 100644 index 000000000..432eef8dd --- /dev/null +++ b/testing/tests/api/test_importing.py @@ -0,0 +1,26 @@ +from pymol import cmd +from pymol import test_utils +from pymol.querying import cif_get_array + + +@test_utils.requires_version("3.0") +def test_bcif(): + cmd.load(test_utils.datafile("115d.bcif.gz")) + assert cmd.count_atoms() == 407 + +@test_utils.requires_version("3.0") +def test_bcif_array(): + obj_name = "foo" + cmd.set('cif_keepinmemory', 1) + cmd.load(test_utils.datafile("115d.bcif.gz"), object=obj_name) + arr = cif_get_array(obj_name, "_pdbx_database_status.entry_id", "s") + assert arr == ["115D"] + + arr = cif_get_array(obj_name, "_entity_poly.pdbx_strand_id", "s") + assert arr == ["A,B"] + + arr = cif_get_array(obj_name, "_pdbx_struct_oper_list.name", "s") + assert arr == ["1_555"] + + arr = cif_get_array(obj_name, "_pdbx_struct_assembly.oligomeric_count", "i") + assert arr == [2]