Skip to content

Commit

Permalink
Add support for multibyte UTF encoding auto-detection
Browse files Browse the repository at this point in the history
  • Loading branch information
variar committed Apr 25, 2016
1 parent 39d98e3 commit 16be497
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 228 deletions.
8 changes: 3 additions & 5 deletions glogg.pro
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ SOURCES += \
src/quickfindmux.cpp \
src/signalmux.cpp \
src/tabbedcrawlerwidget.cpp \
src/viewtools.cpp \
src/encodingspeculator.cpp
src/viewtools.cpp

INCLUDEPATH += src/

Expand Down Expand Up @@ -100,7 +99,6 @@ HEADERS += \
src/loadingstatus.h \
src/externalcom.h \
src/viewtools.h \
src/encodingspeculator.h \
src/data/atomicflag.h

isEmpty(BOOST_PATH) {
Expand Down Expand Up @@ -201,9 +199,9 @@ UI_DIR = $${OUT_PWD}/.ui/$${DESTDIR}-shared
#C++0x:QMAKE_CXXFLAGS += -std=c++0x
#C++11:QMAKE_CXXFLAGS += -std=c++11
CONFIG += c++11
release:QMAKE_CXXFLAGS += -O2
#release:QMAKE_CXXFLAGS += -O2
# Debug symbols even in release build
#QMAKE_CXXFLAGS += -g
QMAKE_CXXFLAGS += -g
GPROF {
QMAKE_CXXFLAGS += -pg
QMAKE_LFLAGS += -pg
Expand Down
26 changes: 6 additions & 20 deletions src/crawlerwidget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -967,40 +967,26 @@ void CrawlerWidget::updateEncoding()
static const char* latin1_encoding = "iso-8859-1";
static const char* utf8_encoding = "utf-8";

const char* encoding = latin1_encoding;
QTextCodec* textCodec = QTextCodec::codecForName(latin1_encoding);
encoding_text_ = tr( "Displayed as ISO-8859-1" );

switch ( encodingSetting_ ) {
case ENCODING_AUTO:
switch ( logData_->getDetectedEncoding() ) {
case EncodingSpeculator::Encoding::ASCII7:
encoding = latin1_encoding;
encoding_text_ = tr( "US-ASCII" );
break;
case EncodingSpeculator::Encoding::ASCII8:
encoding = latin1_encoding;
encoding_text_ = tr( "ISO-8859-1" );
break;
case EncodingSpeculator::Encoding::UTF8:
encoding = utf8_encoding;
encoding_text_ = tr( "UTF-8" );
break;
}
textCodec = logData_->getDetectedEncoding();
encoding_text_ = tr(textCodec->name().data());
break;
case ENCODING_UTF8:
encoding = utf8_encoding;
textCodec = QTextCodec::codecForName(utf8_encoding);
encoding_text_ = tr( "Displayed as UTF-8" );
break;
case ENCODING_ISO_8859_1:
default:
break;


}

logData_->setDisplayEncoding( encoding );
logData_->setDisplayEncoding( textCodec->name().data() );
logMainView->forceRefresh();
logFilteredData_->setDisplayEncoding( encoding );
logFilteredData_->setDisplayEncoding( textCodec->name().data() );
filteredView->forceRefresh();
}

Expand Down
2 changes: 1 addition & 1 deletion src/data/logdata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ QStringList LogData::doGetExpandedLines( qint64 first_line, int number ) const
return list;
}

EncodingSpeculator::Encoding LogData::getDetectedEncoding() const
QTextCodec* LogData::getDetectedEncoding() const
{
return indexing_data_.getEncodingGuess();
}
2 changes: 1 addition & 1 deletion src/data/logdata.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class LogData : public AbstractLogData {
void setPollingInterval( uint32_t interval_ms );

// Get the auto-detected encoding for the indexed text.
EncodingSpeculator::Encoding getDetectedEncoding() const;
QTextCodec* getDetectedEncoding() const;

signals:
// Sent during the 'attach' process to signal progress
Expand Down
73 changes: 48 additions & 25 deletions src/data/logdataworkerthread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@
*/

#include <QFile>
#include <QTextStream>

#include "log.h"

#include "logdata.h"
#include "logdataworkerthread.h"

// Size of the chunk to read (1 MiB)
const int IndexOperation::sizeChunk = 5*1024*1024;
const int IndexOperation::sizeChunk = 1*1024*1024;

qint64 IndexingData::getSize() const
{
Expand Down Expand Up @@ -55,16 +56,16 @@ qint64 IndexingData::getPosForLine( LineNumber line ) const
return linePosition_.at( line );
}

EncodingSpeculator::Encoding IndexingData::getEncodingGuess() const
QTextCodec *IndexingData::getEncodingGuess() const
{
QMutexLocker locker( &dataMutex_ );

return encoding_;
}

void IndexingData::addAll( qint64 size, int length,
void IndexingData::addAll(qint64 size, int length,
const FastLinePositionArray& linePosition,
EncodingSpeculator::Encoding encoding )
QTextCodec *encoding )

{
QMutexLocker locker( &dataMutex_ );
Expand All @@ -81,7 +82,7 @@ void IndexingData::clear()
maxLength_ = 0;
indexedSize_ = 0;
linePosition_ = LinePositionArray();
encoding_ = EncodingSpeculator::Encoding::ASCII7;
encoding_ = QTextCodec::codecForLocale();
}

LogDataWorkerThread::LogDataWorkerThread( IndexingData* indexing_data )
Expand Down Expand Up @@ -120,7 +121,7 @@ void LogDataWorkerThread::indexAll()

interruptRequested_.clear();
operationRequested_ = new FullIndexOperation( fileName_,
indexing_data_, &interruptRequested_, &encodingSpeculator_ );
indexing_data_, &interruptRequested_ );
operationRequestedCond_.wakeAll();
}

Expand All @@ -136,7 +137,7 @@ void LogDataWorkerThread::indexAdditionalLines( qint64 position )

interruptRequested_.clear();
operationRequested_ = new PartialIndexOperation( fileName_,
indexing_data_, &interruptRequested_, &encodingSpeculator_, position );
indexing_data_, &interruptRequested_, position );
operationRequestedCond_.wakeAll();
}

Expand Down Expand Up @@ -191,31 +192,30 @@ void LogDataWorkerThread::run()
//

IndexOperation::IndexOperation( const QString& fileName,
IndexingData* indexingData, AtomicFlag* interruptRequest,
EncodingSpeculator* encodingSpeculator )
IndexingData* indexingData, AtomicFlag* interruptRequest)
: fileName_( fileName )
{
interruptRequest_ = interruptRequest;
indexing_data_ = indexingData;
encoding_speculator_ = encodingSpeculator;
}

PartialIndexOperation::PartialIndexOperation( const QString& fileName,
IndexingData* indexingData, AtomicFlag* interruptRequest,
EncodingSpeculator* speculator, qint64 position )
: IndexOperation( fileName, indexingData, interruptRequest, speculator )
IndexingData* indexingData, AtomicFlag* interruptRequest, qint64 position )
: IndexOperation( fileName, indexingData, interruptRequest )
{
initialPosition_ = position;
}

void IndexOperation::doIndex( IndexingData* indexing_data,
EncodingSpeculator* encoding_speculator, qint64 initialPosition )
void IndexOperation::doIndex(IndexingData* indexing_data, qint64 initialPosition )
{
qint64 pos = initialPosition; // Absolute position of the start of current line
qint64 end = 0; // Absolute position of the end of current line
int additional_spaces = 0; // Additional spaces due to tabs

QTextCodec* fileTextCodec= nullptr;

QFile file( fileName_ );

if ( file.open( QIODevice::ReadOnly ) ) {
// Count the number of lines and max length
// (read big chunks to speed up reading from disk)
Expand All @@ -231,23 +231,44 @@ void IndexOperation::doIndex( IndexingData* indexing_data,
const qint64 block_beginning = file.pos();
const QByteArray block = file.read( sizeChunk );

if (!fileTextCodec) {
fileTextCodec = QTextCodec::codecForUtfText(block);
}

quint8 charWidth = 1;
switch(fileTextCodec->mibEnum())
{
case 1013:
case 1014:
case 1015:
charWidth = 2;
break;
case 1017:
case 1018:
case 1019:
charWidth = 4;
}

// Count the number of lines in each chunk
qint64 pos_within_block = 0;
while ( pos_within_block != -1 ) {
pos_within_block = qMax( pos - block_beginning, 0LL);

// Looking for the next \n, expanding tabs in the process
do {
if ( pos_within_block < block.length() ) {
const char c = block.at(pos_within_block);
encoding_speculator->inject_byte( c );
if ( c == '\n' )

const char c1 = block.at(pos_within_block);
const char c2 = block.at(pos_within_block + charWidth - 1);

if ( c1 == '\n' || c2 == '\n')
break;
else if ( c == '\t' )
else if ( c1 == '\t' || c2 == '\t' )
additional_spaces += AbstractLogData::tabStop -
( ( ( block_beginning - pos ) + pos_within_block
+ additional_spaces ) % AbstractLogData::tabStop ) - 1;

pos_within_block++;
pos_within_block += charWidth;
}
else {
pos_within_block = -1;
Expand All @@ -260,15 +281,16 @@ void IndexOperation::doIndex( IndexingData* indexing_data,
const int length = end-pos + additional_spaces;
if ( length > max_length )
max_length = length;
pos = end + 1;

pos = end + charWidth;
additional_spaces = 0;
line_positions.append( pos );
}
}

// Update the shared data
indexing_data->addAll( block.length(), max_length, line_positions,
encoding_speculator->guess() );
fileTextCodec );

// Update the caller for progress indication
int progress = ( file.size() > 0 ) ? pos*100 / file.size() : 100;
Expand All @@ -285,7 +307,7 @@ void IndexOperation::doIndex( IndexingData* indexing_data,
line_position.append( file_size + 1 );
line_position.setFakeFinalLF();

indexing_data->addAll( 0, 0, line_position, encoding_speculator->guess() );
indexing_data->addAll( 0, 0, line_position, fileTextCodec );
}
}
else {
Expand All @@ -295,6 +317,7 @@ void IndexOperation::doIndex( IndexingData* indexing_data,

emit indexingProgressed( 100 );
}
LOG(logINFO) << "Detected encoding " << fileTextCodec->name().data();
}

// Called in the worker thread's context
Expand All @@ -310,7 +333,7 @@ bool FullIndexOperation::start()
// First empty the index
indexing_data_->clear();

doIndex( indexing_data_, encoding_speculator_, 0 );
doIndex( indexing_data_, 0 );

LOG(logDEBUG) << "FullIndexOperation: ... finished counting."
"interrupt = " << static_cast<bool>(*interruptRequest_);
Expand All @@ -328,7 +351,7 @@ bool PartialIndexOperation::start()

emit indexingProgressed( 0 );

doIndex( indexing_data_, encoding_speculator_, initialPosition_ );
doIndex( indexing_data_, initialPosition_ );

LOG(logDEBUG) << "PartialIndexOperation: ... finished counting.";

Expand Down
Loading

0 comments on commit 16be497

Please sign in to comment.