diff --git a/java/gradle.properties b/java/gradle.properties new file mode 100644 index 00000000..e0c7cefc --- /dev/null +++ b/java/gradle.properties @@ -0,0 +1 @@ +binexportProjectName = BinExport diff --git a/java/settings.gradle b/java/settings.gradle new file mode 100644 index 00000000..6faef2df --- /dev/null +++ b/java/settings.gradle @@ -0,0 +1 @@ +rootProject.name = binexportProjectName diff --git a/java/src/main/proto/binexport2.proto b/java/src/main/proto/binexport2.proto deleted file mode 100644 index 4e152350..00000000 --- a/java/src/main/proto/binexport2.proto +++ /dev/null @@ -1,409 +0,0 @@ -// Copyright 2011-2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file describes a compact representation for disassembled binaries. It is -// loosely based on the PostgreSQL database schema used by BinNavi's -// postgresql_tables.sql (https://git.io/vzlYw). -// It is the output format for the BinExport IDA plugin and the BinDetego -// disassembler and consumed by the BinDiff comparison engine. - -// The representation is generic to accommodate various source architectures. -// In particular 32 and 64 bit versions of x86, ARM, PowerPC and MIPS have been -// tested. -// -// Multiple levels of deduping have been applied to make the format more compact -// and avoid redundant data duplication. Some of this due to hard-earned -// experience trying to cope with intentionally obfuscated malicious binaries. -// Note in particular that the same instruction may occur in multiple basic -// blocks and the same basic block in multiple functions (instruction and basic -// block sharing). Implemented naively, malware can use this to cause -// combinatorial explosion in memory usage, DOSing the analyst. This format -// should store every unique expression, mnemonic, operand, instruction and -// basic block only once instead of duplicating the information for every -// instance of it. -// -// This format does _not_ try to be 100% backwards compatible with the old -// version. In particular, we do not store IDA's comment types, making lossless -// porting of IDA comments impossible. We do however, store comments and -// expression substitutions, so porting the actual data is possible, just not -// the exact IDA type. -// -// While it would be more natural to use addresses when defining call graph and -// flow graph edges and other such references, it is more efficient to employ -// one more level of indirection and use indices into the basic block or -// function arrays instead. This is because addresses will usually use most of -// the available 64 bit space while indices will be much smaller and compress -// much better (less randomly distributed). -// -// We omit all fields that are set to their default value anyways. Note that -// this has two side effects: -// - changing the defaults in this proto file will, in effect, change what's -// read from disk -// - the generated code has_* methods are somewhat less useful -// WARNING: We omit the defaults manually in the code writing the data. Do not -// change the defaults here without changing the code! -// -// TODO(cblichmann): Link flow graphs to call graph nodes. The connection is -// there via the address, but tricky to extract. - -syntax = "proto2"; - -option java_package = "com.google.security.zynamics"; -option java_outer_classname = "BinExport"; - -message BinExport2 { - message Meta { - reserved 5; // Pre-BinDiff 4.3 padding - - // Input binary filename including file extension but excluding file path. - // example: "insider_gcc.exe" - optional string executable_name = 1; - - // Application defined executable id. Often the SHA256 hash of the input - // binary. - optional string executable_id = 2; - - // Input architecture name, e.g. x86-32. - optional string architecture_name = 3; - - // When did this file get created? Unix time. This may be used for some - // primitive versioning in case the file format ever changes. - optional int64 timestamp = 4; - } - - message CallGraph { - message Vertex { - enum Type { - // Regular function with full disassembly. - NORMAL = 0; - - // This function is a well known library function. - LIBRARY = 1; - - // Imported from a dynamic link library (e.g. dll). - IMPORTED = 2; - - // A thunk function, forwarding its work via an unconditional jump. - THUNK = 3; - - // An invalid function (a function that contained invalid code or was - // considered invalid by some heuristics). - INVALID = 4; - } - - // The function's entry point address. - optional uint64 address = 1; - optional Type type = 2 [default = NORMAL]; - - // If the function has a user defined, real name it will be given here. - // main() is a proper name, sub_BAADF00D is not (auto generated dummy - // name). - optional string mangled_name = 3; - - // Demangled name if the function is a mangled C++ function and we could - // demangle it. - optional string demangled_name = 4; - - // If this is a library function, what is its index in library arrays. - optional int32 library_index = 5; - - // If module name, such as class name for DEX files, is present - index in - // module table. - optional int32 module_index = 6; - } - - message Edge { - // source and target index into the vertex repeated field. - optional int32 source_vertex_index = 1; - optional int32 target_vertex_index = 2; - } - - // vertices == functions in the call graph. - repeated Vertex vertex = 1; - - // edges == calls in the call graph. - repeated Edge edge = 2; - } - - // An operand consists of 1 or more expressions, linked together as a tree. - message Expression { - enum Type { - SYMBOL = 1; - IMMEDIATE_INT = 2; - IMMEDIATE_FLOAT = 3; - OPERATOR = 4; - REGISTER = 5; - SIZE_PREFIX = 6; - DEREFERENCE = 7; - } - - // IMMEDIATE_INT is by far the most common type and thus we can save some - // space by omitting it as the default. - optional Type type = 1 [default = IMMEDIATE_INT]; - - // Symbol for this expression. Interpretation depends on type. Examples - // include: "eax", "[", "+" - optional string symbol = 2; - - // If the expression can be interpreted as an integer value (IMMEDIATE_INT) - // the value is given here. - optional uint64 immediate = 3; - - // The parent expression. Example expression tree for the second operand of: - // mov eax, b4 [ebx + 12] - // "b4" --- "[" --- "+" --- "ebx" - // \ "12" - optional int32 parent_index = 4; - - // true if the expression has entry in relocation table - optional bool is_relocation = 5; - } - - // An instruction may have 0 or more operands. - message Operand { - // Contains all expressions constituting this operand. All expressions - // should be linked into a single tree, i.e. there should only be one - // expression in this list with parent_index == NULL and all others should - // descend from that. Rendering order for expressions on the same tree level - // (siblings) is implicitly given by the order they are referenced in this - // repeated field. - // Implicit: expression sequence - repeated int32 expression_index = 1; - } - - // An instruction has exactly 1 mnemonic. - message Mnemonic { - // Literal representation of the mnemonic, e.g.: "mov". - optional string name = 1; - } - - message Instruction { - // This will only be filled for instructions that do not just flow from the - // immediately preceding instruction. Regular instructions will have to - // calculate their own address by adding raw_bytes.size() to the previous - // instruction's address. - optional uint64 address = 1; - - // If this is a call instruction and call targets could be determined - // they'll be given here. Note that we may or may not have a flow graph for - // the target and thus cannot use an index into the flow graph table here. - // We could potentially use call graph nodes, but linking instructions to - // the call graph directly does not seem a good choice. - repeated uint64 call_target = 2; - - // Index into the mnemonic array of strings. Used for de-duping the data. - // The default value is used for the most common mnemonic in the executable. - optional int32 mnemonic_index = 3 [default = 0]; - - // Indices into the operand tree. On X86 this can be 0, 1 or 2 elements - // long, 3 elements with VEX/EVEX. - // Implicit: operand sequence - repeated int32 operand_index = 4; - - // The unmodified input bytes corresponding to this instruction. - optional bytes raw_bytes = 5; - - // Implicit: comment sequence - repeated int32 comment_index = 6; - } - - message BasicBlock { - // This is a space optimization. The instructions for an individual basic - // block will usually be in a continuous index range. Thus it is more - // efficient to store the range instead of individual indices. However, this - // does not hold true for all basic blocks, so we need to be able to store - // multiple index ranges per block. - message IndexRange { - // These work like begin and end iterators, i.e. the sequence is - // [begin_index, end_index). If the sequence only contains a single - // element end_index will be omitted. - optional int32 begin_index = 1; - optional int32 end_index = 2; - } - - // Implicit: instruction sequence - repeated IndexRange instruction_index = 1; - } - - message FlowGraph { - message Edge { - enum Type { - CONDITION_TRUE = 1; - CONDITION_FALSE = 2; - UNCONDITIONAL = 3; - SWITCH = 4; - } - - // Source instruction will always be the last instruction of the source - // basic block, target instruction the first instruction of the target - // basic block. - optional int32 source_basic_block_index = 1; - optional int32 target_basic_block_index = 2; - optional Type type = 3 [default = UNCONDITIONAL]; - - // Indicates whether this is a loop edge as determined by Lengauer-Tarjan. - optional bool is_back_edge = 4 [default = false]; - } - - // Basic blocks are sorted by address. - repeated int32 basic_block_index = 1; - - // The flow graph's entry point address is the first instruction of the - // entry_basic_block. - optional int32 entry_basic_block_index = 3; - - repeated Edge edge = 2; - } - - // Generic reference class used for address comments (deprecated), string - // references and expression substitutions. It allows referencing from an - // instruction, operand, expression subtree tuple to a de-duped string in the - // string table. - message Reference { - // Index into the global instruction table. - optional int32 instruction_index = 1; - - // Index into the operand array local to an instruction. - optional int32 instruction_operand_index = 2 [default = 0]; - - // Index into the expression array local to an operand. - optional int32 operand_expression_index = 3 [default = 0]; - - // Index into the global string table. - optional int32 string_table_index = 4; - } - - message DataReference { - // Index into the global instruction table. - optional int32 instruction_index = 1; - - // Address being referred. - optional uint64 address = 2; - } - - message Comment { - enum Type { - // A regular instruction comment. Typically displayed next to the - // instruction disassembly. - DEFAULT = 0; - - // A comment line that is typically displayed before (above) the - // instruction it refers to. - ANTERIOR = 1; - - // Like ANTERIOR, but a typically displayed after (below). - POSTERIOR = 2; - - // Similar to an ANTERIOR comment, but applies to the beginning of an - // identified function. Programs displaying the proto may choose to render - // these differently (e.g. above an inferred function signature). - FUNCTION = 3; - - // Named constants, bitfields and similar. - ENUM = 4; - - // Named locations, usually the target of a jump. - LOCATION = 5; - - // Data cross references. - GLOBAL_REFERENCE = 6; - - // Local/stack variables. - LOCAL_REFERENCE = 7; - } - - // Index into the global instruction table. This is here to enable - // comment processing without having to iterate over all instructions. - // There is an N:M mapping of instructions to comments. - optional int32 instruction_index = 1; - - // Index into the operand array local to an instruction. - optional int32 instruction_operand_index = 2 [default = 0]; - - // Index into the expression array local to an operand, like in Reference. - // This is not currently used, but allows to implement expression - // substitutions. - optional int32 operand_expression_index = 3 [default = 0]; - - // Index into the global string table. - optional int32 string_table_index = 4; - - // Comment is propagated to all locations that reference the original - // location. - optional bool repeatable = 5; - - optional Type type = 6 [default = DEFAULT]; - } - - message Section { - // Section start address. - optional uint64 address = 1; - - // Section size. - optional uint64 size = 2; - - // Read flag of the section, True when section is readable. - optional bool flag_r = 3; - - // Write flag of the section, True when section is writable. - optional bool flag_w = 4; - - // Execute flag of the section, True when section is executable. - optional bool flag_x = 5; - } - - message Library { - // If this library is statically linked. - optional bool is_static = 1; - - // Address where this library was loaded, 0 if unknown. - optional uint64 load_address = 2 [default = 0]; - - // Name of the library (format is platform-dependent). - optional string name = 3; - } - - message Module { - // Name, such as Java class name. Platform-dependent. - optional string name = 1; - } - - optional Meta meta_information = 1; - repeated Expression expression = 2; - repeated Operand operand = 3; - repeated Mnemonic mnemonic = 4; - repeated Instruction instruction = 5; - repeated BasicBlock basic_block = 6; - repeated FlowGraph flow_graph = 7; - optional CallGraph call_graph = 8; - - repeated string string_table = 9; - - // No longer written. This is here so that BinDiff can work with older - // BinExport files. - repeated Reference address_comment = 10 [deprecated = true]; - - // Rich comment index used for BinDiff's comment porting. - repeated Comment comment = 17; - repeated Reference string_reference = 11; - repeated Reference expression_substitution = 12; - repeated Section section = 13; - - repeated Library library = 14; - repeated DataReference data_reference = 15; - repeated Module module = 16; - - // Allow for future extensions. - extensions 100000000 to max; -} diff --git a/java/src/main/proto/binexport2.proto b/java/src/main/proto/binexport2.proto new file mode 120000 index 00000000..96dce6a4 --- /dev/null +++ b/java/src/main/proto/binexport2.proto @@ -0,0 +1 @@ +../../../../binexport2.proto \ No newline at end of file