From 16be497937aba5f2b2734a22063bfbde76fdef4b Mon Sep 17 00:00:00 2001 From: Anton Filimonov Date: Tue, 26 Apr 2016 02:28:36 +0300 Subject: [PATCH] Add support for multibyte utf encoding auto detection --- glogg.pro | 8 +-- src/crawlerwidget.cpp | 26 ++------- src/data/logdata.cpp | 2 +- src/data/logdata.h | 2 +- src/data/logdataworkerthread.cpp | 73 +++++++++++++++-------- src/data/logdataworkerthread.h | 31 ++++------ src/encodingspeculator.cpp | 99 -------------------------------- src/encodingspeculator.h | 58 ------------------- 8 files changed, 71 insertions(+), 228 deletions(-) delete mode 100644 src/encodingspeculator.cpp delete mode 100644 src/encodingspeculator.h diff --git a/glogg.pro b/glogg.pro index 02c471812..7a2a2eb81 100644 --- a/glogg.pro +++ b/glogg.pro @@ -52,8 +52,7 @@ SOURCES += \ src/quickfindmux.cpp \ src/signalmux.cpp \ src/tabbedcrawlerwidget.cpp \ - src/viewtools.cpp \ - src/encodingspeculator.cpp + src/viewtools.cpp INCLUDEPATH += src/ @@ -100,7 +99,6 @@ HEADERS += \ src/loadingstatus.h \ src/externalcom.h \ src/viewtools.h \ - src/encodingspeculator.h \ src/data/atomicflag.h isEmpty(BOOST_PATH) { @@ -201,9 +199,9 @@ UI_DIR = $${OUT_PWD}/.ui/$${DESTDIR}-shared #C++0x:QMAKE_CXXFLAGS += -std=c++0x #C++11:QMAKE_CXXFLAGS += -std=c++11 CONFIG += c++11 -release:QMAKE_CXXFLAGS += -O2 +#release:QMAKE_CXXFLAGS += -O2 # Debug symbols even in release build -#QMAKE_CXXFLAGS += -g +QMAKE_CXXFLAGS += -g GPROF { QMAKE_CXXFLAGS += -pg QMAKE_LFLAGS += -pg diff --git a/src/crawlerwidget.cpp b/src/crawlerwidget.cpp index bee9c5180..a07b49a04 100644 --- a/src/crawlerwidget.cpp +++ b/src/crawlerwidget.cpp @@ -967,40 +967,26 @@ void CrawlerWidget::updateEncoding() static const char* latin1_encoding = "iso-8859-1"; static const char* utf8_encoding = "utf-8"; - const char* encoding = latin1_encoding; + QTextCodec* textCodec = QTextCodec::codecForName(latin1_encoding); encoding_text_ = tr( "Displayed as ISO-8859-1" ); switch ( encodingSetting_ ) { case ENCODING_AUTO: - switch ( logData_->getDetectedEncoding() ) { - case EncodingSpeculator::Encoding::ASCII7: - encoding = latin1_encoding; - encoding_text_ = tr( "US-ASCII" ); - break; - case EncodingSpeculator::Encoding::ASCII8: - encoding = latin1_encoding; - encoding_text_ = tr( "ISO-8859-1" ); - break; - case EncodingSpeculator::Encoding::UTF8: - encoding = utf8_encoding; - encoding_text_ = tr( "UTF-8" ); - break; - } + textCodec = logData_->getDetectedEncoding(); + encoding_text_ = tr(textCodec->name().data()); break; case ENCODING_UTF8: - encoding = utf8_encoding; + textCodec = QTextCodec::codecForName(utf8_encoding); encoding_text_ = tr( "Displayed as UTF-8" ); break; case ENCODING_ISO_8859_1: default: break; - - } - logData_->setDisplayEncoding( encoding ); + logData_->setDisplayEncoding( textCodec->name().data() ); logMainView->forceRefresh(); - logFilteredData_->setDisplayEncoding( encoding ); + logFilteredData_->setDisplayEncoding( textCodec->name().data() ); filteredView->forceRefresh(); } diff --git a/src/data/logdata.cpp b/src/data/logdata.cpp index 9a97a4a06..b12b8f192 100644 --- a/src/data/logdata.cpp +++ b/src/data/logdata.cpp @@ -415,7 +415,7 @@ QStringList LogData::doGetExpandedLines( qint64 first_line, int number ) const return list; } -EncodingSpeculator::Encoding LogData::getDetectedEncoding() const +QTextCodec* LogData::getDetectedEncoding() const { return indexing_data_.getEncodingGuess(); } diff --git a/src/data/logdata.h b/src/data/logdata.h index daf14e7f4..cebb22f6d 100644 --- a/src/data/logdata.h +++ b/src/data/logdata.h @@ -79,7 +79,7 @@ class LogData : public AbstractLogData { void setPollingInterval( uint32_t interval_ms ); // Get the auto-detected encoding for the indexed text. - EncodingSpeculator::Encoding getDetectedEncoding() const; + QTextCodec* getDetectedEncoding() const; signals: // Sent during the 'attach' process to signal progress diff --git a/src/data/logdataworkerthread.cpp b/src/data/logdataworkerthread.cpp index dce92bb47..ae48a6409 100644 --- a/src/data/logdataworkerthread.cpp +++ b/src/data/logdataworkerthread.cpp @@ -18,6 +18,7 @@ */ #include +#include #include "log.h" @@ -25,7 +26,7 @@ #include "logdataworkerthread.h" // Size of the chunk to read (5 MiB) -const int IndexOperation::sizeChunk = 5*1024*1024; +const int IndexOperation::sizeChunk = 1*1024*1024; qint64 IndexingData::getSize() const { @@ -55,16 +56,16 @@ qint64 IndexingData::getPosForLine( LineNumber line ) const return linePosition_.at( line ); } -EncodingSpeculator::Encoding IndexingData::getEncodingGuess() const +QTextCodec *IndexingData::getEncodingGuess() const { QMutexLocker locker( &dataMutex_ ); return encoding_; } -void IndexingData::addAll( qint64 size, int length, +void IndexingData::addAll(qint64 size, int length, const FastLinePositionArray& linePosition, - EncodingSpeculator::Encoding encoding ) + QTextCodec *encoding ) { QMutexLocker locker( &dataMutex_ ); @@ -81,7 +82,7 @@ void IndexingData::clear() maxLength_ = 0; indexedSize_ = 0; linePosition_ = LinePositionArray(); - encoding_ = EncodingSpeculator::Encoding::ASCII7; + encoding_ = QTextCodec::codecForLocale(); } LogDataWorkerThread::LogDataWorkerThread( IndexingData* indexing_data ) @@ -120,7 +121,7 @@ void LogDataWorkerThread::indexAll() interruptRequested_.clear(); operationRequested_ = new FullIndexOperation( fileName_, - indexing_data_, &interruptRequested_, &encodingSpeculator_ ); + indexing_data_, &interruptRequested_ ); operationRequestedCond_.wakeAll(); } @@ -136,7 +137,7 @@ void LogDataWorkerThread::indexAdditionalLines( qint64 position ) interruptRequested_.clear(); operationRequested_ = new PartialIndexOperation( fileName_, - indexing_data_, &interruptRequested_, &encodingSpeculator_, position ); + indexing_data_, &interruptRequested_, position ); operationRequestedCond_.wakeAll(); } @@ -191,31 +192,30 @@ void LogDataWorkerThread::run() // IndexOperation::IndexOperation( const QString& fileName, - IndexingData* indexingData, AtomicFlag* interruptRequest, - EncodingSpeculator* encodingSpeculator ) + IndexingData* indexingData, AtomicFlag* interruptRequest) : fileName_( fileName ) { interruptRequest_ = interruptRequest; indexing_data_ = indexingData; - encoding_speculator_ = encodingSpeculator; } PartialIndexOperation::PartialIndexOperation( const QString& fileName, - IndexingData* indexingData, AtomicFlag* interruptRequest, - EncodingSpeculator* speculator, qint64 position ) - : IndexOperation( fileName, indexingData, interruptRequest, speculator ) + IndexingData* indexingData, AtomicFlag* interruptRequest, qint64 position ) + : IndexOperation( fileName, indexingData, interruptRequest ) { initialPosition_ = position; } -void IndexOperation::doIndex( IndexingData* indexing_data, - EncodingSpeculator* encoding_speculator, qint64 initialPosition ) +void IndexOperation::doIndex(IndexingData* indexing_data, qint64 initialPosition ) { qint64 pos = initialPosition; // Absolute position of the start of current line qint64 end = 0; // Absolute position of the end of current line int additional_spaces = 0; // Additional spaces due to tabs + QTextCodec* fileTextCodec= nullptr; + QFile file( fileName_ ); + if ( file.open( QIODevice::ReadOnly ) ) { // Count the number of lines and max length // (read big chunks to speed up reading from disk) @@ -231,23 +231,44 @@ void IndexOperation::doIndex( IndexingData* indexing_data, const qint64 block_beginning = file.pos(); const QByteArray block = file.read( sizeChunk ); + if (!fileTextCodec) { + fileTextCodec = QTextCodec::codecForUtfText(block); + } + + quint8 charWidth = 1; + switch(fileTextCodec->mibEnum()) + { + case 1013: + case 1014: + case 1015: + charWidth = 2; + break; + case 1017: + case 1018: + case 1019: + charWidth = 4; + } + // Count the number of lines in each chunk qint64 pos_within_block = 0; while ( pos_within_block != -1 ) { pos_within_block = qMax( pos - block_beginning, 0LL); + // Looking for the next \n, expanding tabs in the process do { if ( pos_within_block < block.length() ) { - const char c = block.at(pos_within_block); - encoding_speculator->inject_byte( c ); - if ( c == '\n' ) + + const char c1 = block.at(pos_within_block); + const char c2 = block.at(pos_within_block + charWidth - 1); + + if ( c1 == '\n' || c2 == '\n') break; - else if ( c == '\t' ) + else if ( c1 == '\t' || c2 == '\t' ) additional_spaces += AbstractLogData::tabStop - ( ( ( block_beginning - pos ) + pos_within_block + additional_spaces ) % AbstractLogData::tabStop ) - 1; - pos_within_block++; + pos_within_block += charWidth; } else { pos_within_block = -1; @@ -260,7 +281,8 @@ void IndexOperation::doIndex( IndexingData* indexing_data, const int length = end-pos + additional_spaces; if ( length > max_length ) max_length = length; - pos = end + 1; + + pos = end + charWidth; additional_spaces = 0; line_positions.append( pos ); } @@ -268,7 +290,7 @@ void IndexOperation::doIndex( IndexingData* indexing_data, // Update the shared data indexing_data->addAll( block.length(), max_length, line_positions, - encoding_speculator->guess() ); + fileTextCodec ); // Update the caller for progress indication int progress = ( file.size() > 0 ) ? pos*100 / file.size() : 100; @@ -285,7 +307,7 @@ void IndexOperation::doIndex( IndexingData* indexing_data, line_position.append( file_size + 1 ); line_position.setFakeFinalLF(); - indexing_data->addAll( 0, 0, line_position, encoding_speculator->guess() ); + indexing_data->addAll( 0, 0, line_position, fileTextCodec ); } } else { @@ -295,6 +317,7 @@ void IndexOperation::doIndex( IndexingData* indexing_data, emit indexingProgressed( 100 ); } + LOG(logINFO) << "Detected encoding " << fileTextCodec->name().data(); } // Called in the worker thread's context @@ -310,7 +333,7 @@ bool FullIndexOperation::start() // First empty the index indexing_data_->clear(); - doIndex( indexing_data_, encoding_speculator_, 0 ); + doIndex( indexing_data_, 0 ); LOG(logDEBUG) << "FullIndexOperation: ... finished counting." "interrupt = " << static_cast(*interruptRequest_); @@ -328,7 +351,7 @@ bool PartialIndexOperation::start() emit indexingProgressed( 0 ); - doIndex( indexing_data_, encoding_speculator_, initialPosition_ ); + doIndex( indexing_data_, initialPosition_ ); LOG(logDEBUG) << "PartialIndexOperation: ... finished counting."; diff --git a/src/data/logdataworkerthread.h b/src/data/logdataworkerthread.h index 29c32c762..7382ad80d 100644 --- a/src/data/logdataworkerthread.h +++ b/src/data/logdataworkerthread.h @@ -25,10 +25,11 @@ #include #include #include +#include #include "loadingstatus.h" #include "linepositionarray.h" -#include "encodingspeculator.h" + #include "utils.h" #include "atomicflag.h" @@ -37,7 +38,7 @@ class IndexingData { public: IndexingData() : dataMutex_(), linePosition_(), maxLength_(0), - indexedSize_(0), encoding_(EncodingSpeculator::Encoding::ASCII7) { } + indexedSize_(0), encoding_(QTextCodec::codecForLocale()) { } // Get the total indexed size qint64 getSize() const; @@ -53,13 +54,13 @@ class IndexingData qint64 getPosForLine( LineNumber line ) const; // Get the guessed encoding for the content. - EncodingSpeculator::Encoding getEncodingGuess() const; + QTextCodec* getEncodingGuess() const; // Atomically add to all the existing // indexing data. void addAll( qint64 size, int length, const FastLinePositionArray& linePosition, - EncodingSpeculator::Encoding encoding ); + QTextCodec* encoding ); // Completely clear the indexing data. void clear(); @@ -71,16 +72,15 @@ class IndexingData int maxLength_; qint64 indexedSize_; - EncodingSpeculator::Encoding encoding_; + QTextCodec* encoding_; }; class IndexOperation : public QObject { Q_OBJECT public: - IndexOperation( const QString& fileName, - IndexingData* indexingData, AtomicFlag* interruptRequest, - EncodingSpeculator* encodingSpeculator ); + IndexOperation(const QString& fileName, + IndexingData* indexingData, AtomicFlag* interruptRequest); virtual ~IndexOperation() { } @@ -96,23 +96,19 @@ class IndexOperation : public QObject // Returns the total size indexed // Modify the passed linePosition and maxLength - void doIndex( IndexingData* linePosition, EncodingSpeculator* encodingSpeculator, - qint64 initialPosition ); + void doIndex( IndexingData* linePosition, qint64 initialPosition ); QString fileName_; AtomicFlag* interruptRequest_; IndexingData* indexing_data_; - - EncodingSpeculator* encoding_speculator_; }; class FullIndexOperation : public IndexOperation { public: FullIndexOperation( const QString& fileName, - IndexingData* indexingData, AtomicFlag* interruptRequest, - EncodingSpeculator* speculator ) - : IndexOperation( fileName, indexingData, interruptRequest, speculator ) { } + IndexingData* indexingData, AtomicFlag* interruptRequest) + : IndexOperation( fileName, indexingData, interruptRequest ) { } virtual bool start(); }; @@ -120,7 +116,7 @@ class PartialIndexOperation : public IndexOperation { public: PartialIndexOperation( const QString& fileName, IndexingData* indexingData, - AtomicFlag* interruptRequest, EncodingSpeculator* speculator, qint64 position ); + AtomicFlag* interruptRequest, qint64 position ); virtual bool start(); private: @@ -185,9 +181,6 @@ class LogDataWorkerThread : public QThread // Pointer to the owner's indexing data (we modify it) IndexingData* indexing_data_; - - // To guess the encoding - EncodingSpeculator encodingSpeculator_; }; #endif diff --git a/src/encodingspeculator.cpp b/src/encodingspeculator.cpp deleted file mode 100644 index ebbd7ae64..000000000 --- a/src/encodingspeculator.cpp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (C) 2016 Nicolas Bonnefon and other contributors - * - * This file is part of glogg. - * - * glogg is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * glogg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with glogg. If not, see . - */ - -#include "encodingspeculator.h" - -#include - -void EncodingSpeculator::inject_byte( uint8_t byte ) -{ - if ( ! ( byte & 0x80 ) ) { - // 7-bit character, all fine - } - else { - switch ( state_ ) { - case State::ASCIIOnly: - case State::ValidUTF8: - if ( ( byte & 0xE0 ) == 0xC0 ) { - state_ = State::UTF8LeadingByteSeen; - code_point_ = ( byte & 0x1F ) << 6; - continuation_left_ = 1; - min_value_ = 0x80; - // std::cout << "Lead: cp= " << std::hex << code_point_ << std::endl; - } - else if ( ( byte & 0xF0 ) == 0xE0 ) { - state_ = State::UTF8LeadingByteSeen; - code_point_ = ( byte & 0x0F ) << 12; - continuation_left_ = 2; - min_value_ = 0x800; - // std::cout << "Lead 3: cp= " << std::hex << code_point_ << std::endl; - } - else if ( ( byte & 0xF8 ) == 0xF0 ) { - state_ = State::UTF8LeadingByteSeen; - code_point_ = ( byte & 0x07 ) << 18; - continuation_left_ = 3; - min_value_ = 0x800; - // std::cout << "Lead 4: cp= " << std::hex << code_point_ << std::endl; - } - else { - state_ = State::Unknown8Bit; - } - break; - case State::UTF8LeadingByteSeen: - if ( ( byte & 0xC0 ) == 0x80 ) { - --continuation_left_; - code_point_ |= ( byte & 0x3F ) << (continuation_left_ * 6); - // std::cout << "Cont: cp= " << std::hex << code_point_ << std::endl; - if ( continuation_left_ == 0 ) { - if ( code_point_ >= min_value_ ) - state_ = State::ValidUTF8; - else - state_ = State::Unknown8Bit; - } - } - else { - state_ = State::Unknown8Bit; - } - break; - } - // state_ = State::Unknown8Bit; - } -} - -EncodingSpeculator::Encoding EncodingSpeculator::guess() const -{ - Encoding guess; - - switch ( state_ ) { - case State::ASCIIOnly: - guess = Encoding::ASCII7; - break; - case State::Unknown8Bit: - case State::UTF8LeadingByteSeen: - guess = Encoding::ASCII8; - break; - case State::ValidUTF8: - guess = Encoding::UTF8; - break; - default: - guess = Encoding::ASCII8; - } - - return guess; -} diff --git a/src/encodingspeculator.h b/src/encodingspeculator.h deleted file mode 100644 index 365843ce4..000000000 --- a/src/encodingspeculator.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (C) 2016 Nicolas Bonnefon and other contributors - * - * This file is part of glogg. - * - * glogg is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * glogg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with glogg. If not, see . - */ - -#ifndef ENCODINGSPECULATOR_H -#define ENCODINGSPECULATOR_H - -#include - -// The encoder speculator tries to determine the likely encoding -// of the stream of bytes which is passed to it. - -class EncodingSpeculator { - public: - enum class Encoding { - ASCII7, - ASCII8, - UTF8 - }; - - EncodingSpeculator() : state_( State::ASCIIOnly ) {} - - // Inject one byte into the speculator - void inject_byte( uint8_t byte ); - - // Returns the current guess based on the previously injected bytes - Encoding guess() const; - - private: - enum class State { - ASCIIOnly, - Unknown8Bit, - UTF8LeadingByteSeen, - ValidUTF8, - }; - - State state_; - uint32_t code_point_; - int continuation_left_; - uint32_t min_value_; -}; - -#endif