Skip to content

Commit

Permalink
Merge pull request #16 from glmcdona/glmcdona/msbuild-fix
Browse files: browse the repository at this point in the history
Fix msbuild with newer compilers
  • Loading branch information
glmcdona authored Jul 8, 2023
2 parents 6f0f188 + 2c812c2 commit 080a585
Show file tree
Hide file tree
Showing 6 changed files with 611 additions and 610 deletions.
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,11 @@
from pybind11.setup_helpers import Pybind11Extension, build_ext
from setuptools import setup, Extension, find_packages

__version__ = "0.1.7"
__version__ = "0.1.8"

ext_modules = [
Pybind11Extension("binary2strings",
sorted(glob("src/*.cpp")), # Sort source files for reproducibility
headers = sorted(glob("src/*.hpp")),
define_macros = [('VERSION_INFO', __version__)],
include_dirs = ["src"],
),
Expand Down
30 changes: 15 additions & 15 deletions src/binary2strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

using namespace std;

size_t try_utf8_char_step(const unsigned char* buffer, size_t buffer_size, long offset)
size_t try_utf8_char_step(const unsigned char* buffer, size_t buffer_size, size_t offset)
{
// Returns 0 if it's not likely a valid UTF-8 character. For the ASCII range of characters it requires
// the character to be a displayable character.
Expand Down Expand Up @@ -99,14 +99,14 @@ int get_language_group(char16_t c)
// Switch the definition based on platform:
#if defined(_WIN32) || defined(_WIN64)
// Note: Buffer overrun security checks disabled, since they added ~50% overhead.
__declspec(safebuffers) extracted_string* try_extract_string(const unsigned char* buffer, size_t buffer_size, long offset, size_t min_chars)
__declspec(safebuffers) extracted_string* try_extract_string(const unsigned char* buffer, size_t buffer_size, size_t offset, size_t min_chars)
#else
extracted_string* try_extract_string(const unsigned char* buffer, size_t buffer_size, long offset, size_t min_chars)
extracted_string* try_extract_string(const unsigned char* buffer, size_t buffer_size, size_t offset, size_t min_chars)
#endif
{
// Try extracting the string as either utf8 or unicode wchar format. Returns None if it's not a valid string.
int i;
int char_count;
size_t i;
size_t char_count;

// Try to parse as utf8 first
size_t utf_char_len;
Expand Down Expand Up @@ -192,7 +192,7 @@ extracted_string* try_extract_string(const unsigned char* buffer, size_t buffer_
}


std::tuple<string, string, std::pair<int, int>, bool> try_extract_string_tuple(const unsigned char* buffer, size_t buffer_size, long offset, size_t min_chars, bool only_interesting)
std::tuple<string, string, std::pair<size_t, size_t>, bool> try_extract_string_tuple(const unsigned char* buffer, size_t buffer_size, size_t offset, size_t min_chars, bool only_interesting)
{
// Simple wrapper to return a tuple instead
extracted_string* s = try_extract_string(buffer, buffer_size, offset, min_chars);
Expand All @@ -206,7 +206,7 @@ std::tuple<string, string, std::pair<int, int>, bool> try_extract_string_tuple(c
auto result = std::make_tuple(
s->get_string(),
s->get_type_string(),
std::pair<int, int>(s->get_offset_start(), s->get_offset_end()),
std::pair<size_t, size_t>(s->get_offset_start(), s->get_offset_end()),
is_interesting
);

Expand All @@ -220,11 +220,11 @@ std::tuple<string, string, std::pair<int, int>, bool> try_extract_string_tuple(c
}


vector<std::tuple<string, string, std::pair<int, int>, bool>> extract_all_strings(const unsigned char buffer[], size_t buffer_size, size_t min_chars, bool only_interesting)
vector<std::tuple<string, string, std::pair<size_t, size_t>, bool>> extract_all_strings(const unsigned char buffer[], size_t buffer_size, size_t min_chars, bool only_interesting)
{
// Process the specified binary buffer and extract all strings
long offset = 0;
vector<std::tuple<string, string, std::pair<int, int>, bool>> r_vect;
size_t offset = 0;
vector<std::tuple<string, string, std::pair<size_t, size_t>, bool>> r_vect;
vector<float> proba_interesting_vect;
vector<float> proba_interesting_avg_vect;
extracted_string* s;
Expand All @@ -242,10 +242,10 @@ vector<std::tuple<string, string, std::pair<int, int>, bool>> extract_all_string

// Add the new string
r_vect.push_back(
tuple<string, string, std::pair<int, int>, bool>(
tuple<string, string, std::pair<size_t, size_t>, bool>(
s->get_string(),
s->get_type_string(),
std::pair<int, int>(s->get_offset_start(), s->get_offset_end()),
std::pair<size_t, size_t>(s->get_offset_start(), s->get_offset_end()),
proba_interesting > 0.5
)
);
Expand All @@ -268,7 +268,7 @@ vector<std::tuple<string, string, std::pair<int, int>, bool>> extract_all_string
proba_interesting_vect.push_back(proba_interesting);

// Advance by the byte-length of the string
offset += (long)s->get_size_in_bytes();
offset += s->get_size_in_bytes();

// Cleanup
delete s;
Expand All @@ -280,7 +280,7 @@ vector<std::tuple<string, string, std::pair<int, int>, bool>> extract_all_string
}

// Have a pass through the strings averaging the interestingness and filtering
vector<std::tuple<string, string, std::pair<int, int>, bool>> r_vect_filt;
vector<std::tuple<string, string, std::pair<size_t, size_t>, bool>> r_vect_filt;
for (int i = 0; i < r_vect.size(); i++)
{
// Get the interestingness
Expand All @@ -298,7 +298,7 @@ vector<std::tuple<string, string, std::pair<int, int>, bool>> extract_all_string
if (!only_interesting || proba_interesting_avg >= 0.2 || proba_interesting_vect[i] >= 0.5)
{
r_vect_filt.push_back(
tuple<string, string, std::pair<int, int>, bool>(
tuple<string, string, std::pair<size_t, size_t>, bool>(
std::get<0>(r_vect[i]),
std::get<1>(r_vect[i]),
std::get<2>(r_vect[i]),
Expand Down
8 changes: 4 additions & 4 deletions src/binary2strings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ static std::unordered_set<char16_t> is_seen_commoncrawl({
}
);

size_t try_utf8_char_step(const unsigned char* buffer, size_t buffer_size, long offset);
size_t try_utf8_char_step(const unsigned char* buffer, size_t buffer_size, size_t offset);

int get_language_group(char16_t c);

extracted_string* try_extract_string(const unsigned char* buffer, size_t buffer_size, long offset, size_t min_chars);
extracted_string* try_extract_string(const unsigned char* buffer, size_t buffer_size, size_t offset, size_t min_chars);

std::tuple<string, string, std::pair<int, int>, bool> try_extract_string_tuple(const unsigned char* buffer, size_t buffer_size, long offset, size_t min_chars, bool only_interesting);
std::tuple<string, string, std::pair<size_t, size_t>, bool> try_extract_string_tuple(const unsigned char* buffer, size_t buffer_size, size_t offset, size_t min_chars, bool only_interesting);

vector<std::tuple<string, string, std::pair<int, int>, bool>> extract_all_strings(const unsigned char buffer[], size_t buffer_size, size_t min_chars, bool only_interesting);
vector<std::tuple<string, string, std::pair<size_t, size_t>, bool>> extract_all_strings(const unsigned char buffer[], size_t buffer_size, size_t min_chars, bool only_interesting);
11 changes: 6 additions & 5 deletions src/extracted_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@

using namespace std;

#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> _conv16;

extracted_string::extracted_string()
{
m_type = TYPE_UNDETERMINED;
m_string = (std::string)NULL;
m_string = "";
m_size_in_bytes = 0;
m_offset_start = 0;
m_offset_end = 0;
}

extracted_string::extracted_string(const char* string, size_t size_in_bytes, STRING_TYPE type, int offset_start, int offset_end)
extracted_string::extracted_string(const char* string, size_t size_in_bytes, STRING_TYPE type, size_t offset_start, size_t offset_end)
{
m_type = type;
m_string = std::string(string, size_in_bytes);
Expand All @@ -23,7 +24,7 @@ extracted_string::extracted_string(const char* string, size_t size_in_bytes, STR
m_offset_end = offset_end;
}

extracted_string::extracted_string(const char16_t* string, size_t size_in_bytes, STRING_TYPE type, int offset_start, int offset_end)
extracted_string::extracted_string(const char16_t* string, size_t size_in_bytes, STRING_TYPE type, size_t offset_start, size_t offset_end)
{
m_type = type;

Expand Down Expand Up @@ -126,12 +127,12 @@ string extracted_string::get_type_string()
}
}

int extracted_string::get_offset_start()
size_t extracted_string::get_offset_start()
{
return m_offset_start;
}

int extracted_string::get_offset_end()
size_t extracted_string::get_offset_end()
{
return m_offset_end;
}
Expand Down
13 changes: 7 additions & 6 deletions src/extracted_string.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Class for extracted strings
#pragma once
#include <string>
#define _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS
#include <codecvt>
#include "string_model.hpp"
#include <unordered_set>
Expand All @@ -24,21 +25,21 @@ class extracted_string
STRING_TYPE m_type;
std::string m_string; // Supports Utf8
size_t m_size_in_bytes;
int m_offset_start;
int m_offset_end;
size_t m_offset_start;
size_t m_offset_end;

public:
extracted_string();
extracted_string(const char* string, size_t size_in_bytes, STRING_TYPE type, int offset_start, int offset_end);
extracted_string(const char16_t* string, size_t size_in_bytes, STRING_TYPE type, int offset_start, int offset_end);
extracted_string(const char* string, size_t size_in_bytes, STRING_TYPE type, size_t offset_start, size_t offset_end);
extracted_string(const char16_t* string, size_t size_in_bytes, STRING_TYPE type, size_t offset_start, size_t offset_end);

float get_proba_interesting();
size_t get_size_in_bytes();
string get_string();
STRING_TYPE get_type();
string get_type_string();
int get_offset_start();
int get_offset_end();
size_t get_offset_start();
size_t get_offset_end();

bool is_interesting();

Expand Down
Loading

0 comments on commit 080a585

Please sign in to comment.