diff --git a/floss/language/rust/extract.py b/floss/language/rust/extract.py
index 4d40c3af9..37f6f7b96 100644
--- a/floss/language/rust/extract.py
+++ b/floss/language/rust/extract.py
@@ -15,6 +15,7 @@
     find_mov_xrefs,
     find_push_xrefs,
     get_rdata_section,
+    get_raw_xrefs_rdata_i386,
     get_struct_string_candidates,
 )
 
@@ -168,7 +169,8 @@ def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticSt
         xrefs_lea = find_lea_xrefs(pe)
         xrefs_push = find_push_xrefs(pe)
         xrefs_mov = find_mov_xrefs(pe)
-        xrefs = itertools.chain(struct_string_addrs, xrefs_lea, xrefs_push, xrefs_mov)
+        xrefs_raw_rdata = get_raw_xrefs_rdata_i386(pe, rdata_section.get_data())
+        xrefs = itertools.chain(struct_string_addrs, xrefs_lea, xrefs_push, xrefs_mov, xrefs_raw_rdata)
 
     elif pe.FILE_HEADER.Machine == pefile.MACHINE_TYPE["IMAGE_FILE_MACHINE_AMD64"]:
         xrefs_lea = find_lea_xrefs(pe)
diff --git a/floss/language/utils.py b/floss/language/utils.py
index 124b0ecad..2a56dd7f8 100644
--- a/floss/language/utils.py
+++ b/floss/language/utils.py
@@ -473,6 +473,51 @@ def get_struct_string_candidates(pe: pefile.PE) -> Iterable[StructString]:
     # dozens of seconds or more (suspect many minutes).
 
 
+def get_raw_xrefs_rdata_i386(pe: pefile.PE, buf: bytes) -> Iterable[VA]:
+    """
+    scan for raw xrefs that are 32-bit absolute addresses in the PE file (i386).
+    They are not encoded as struct String instances.
+
+    example:
+        .rdata:004D6234 dd offset unk_4C85C9
+        .rdata:004D6238 dd offset unk_4C85C3
+        .rdata:004D623C dd offset unk_4C85BB
+        .rdata:004D6240 dd offset unk_4C85B3
+
+    From the disassembly, they are referenced as follows:
+        .text:00498E56 push ds:off_4D61E0[ecx*4]
+
+    The above are not struct String instances, but are references to strings in the PE file.
+    They can be used to divide the string blobs into smaller chunks.
+
+    args:
+      pe: parsed PE file; the range from get_image_range(pe) bounds the accepted values.
+      buf: raw bytes to scan, such as the contents of the .rdata section.
+
+    yields each non-zero word of `buf` (in order, duplicates included) whose value lies within
+    that range. trailing bytes that do not fill a whole word are ignored.
+    """
+    if not buf:
+        return
+
+    low, high = get_image_range(pe)
+
+    # using array module as a high-performance way to access the data as fixed-sized words.
+    # NOTE(review): "I" is the native unsigned int, so this assumes a little-endian host
+    # with 4-byte ints - confirm if other hosts must be supported.
+    words = array.array("I")
+
+    # array rejects byte strings whose length is not a multiple of the item size,
+    # so only feed it the whole words.
+    usable = len(buf) - (len(buf) % words.itemsize)
+    words.frombytes(buf[:usable])
+
+    # every word is a candidate, including the final one.
+    for address in words:
+        if address != 0x0 and low <= address < high:
+            yield address
+
+
 def get_extract_stats(
     pe: pefile, all_ss_strings: List[StaticString], lang_strings: List[StaticString], min_len: int, min_blob_len=0
 ) -> float:
diff --git a/tests/test_language_extract_rust.py b/tests/test_language_extract_rust.py
index 9d210ce0c..a7c300236 100644
--- a/tests/test_language_extract_rust.py
+++ b/tests/test_language_extract_rust.py
@@ -80,3 +80,21 @@ def test_push(request, string, offset, encoding, rust_strings):
 )
 def test_mov_jmp(request, string, offset, encoding, rust_strings):
     assert StaticString(string=string, offset=offset, encoding=encoding) in request.getfixturevalue(rust_strings)
+
+
+@pytest.mark.parametrize(
+    "string,offset,encoding,rust_strings",
+    [
+        # .rdata:004BFA48 dd offset unk_4BA13A
+        # .rdata:004BFA4C dd offset unk_4BA100
+        pytest.param("Invalid branch target in DWARF expression", 0xB813A, StringEncoding.UTF8, "rust_strings32"),
+        pytest.param(
+            "Expected to find an FDE pointer, but found a CIE pointer instead.",
+            0xB8163,
+            StringEncoding.UTF8,
+            "rust_strings32",
+        ),
+    ],
+)
+def test_raw_xrefs(request, string, offset, encoding, rust_strings):
+    assert StaticString(string=string, offset=offset, encoding=encoding) in request.getfixturevalue(rust_strings)