如何在处理西里尔文文本文件时提高 C++ 性能？答案

【问题标题】：How to increase C++ performance while processing cyrillic text files?如何在处理西里尔文文本文件时提高 C++ 性能？
【发布时间】：2021-11-19 18:07:26
【问题描述】：

我正在开发一个 C++ 程序来分析俄语文本中出现的字母。它按计划工作，但它的性能真的很差（比较我的其他 python 程序在近 10 秒内完成这项任务，而这个在大约 7 分钟内完成）。

所以我的问题是如何提高性能？我应该阅读什么来加深对这个问题的理解？

主要

#include <iostream>
#include <io.h>
#include <fcntl.h>
#include "text.h"

int main()
{
_setmode(_fileno(stdout), _O_U16TEXT);
Text_Container mytext("./text.txt");
mytext.initialize();
//std::wcout<< mytext.Display_text()<<std::endl;
mytext.print_dict(0);
mytext.print_dict(1);

return 0;
}

类.cpp


void Text_Container::print_map(std::wstring_view comment, const std::map<wchar_t, wchar_t>& m)
{
    std::wcout << comment;
    for (const auto& [key, value] : m) {
        std::wcout << key << L" = " << value << L"; ";
    }
    std::wcout << L"\n";
}
void Text_Container::print_dict(int mode)
{
    if (mode == 0) {
        std::wcout << "Dictionary with no whitespaces\n";
        for (const auto& [key, value] : _dict) {
            std::wcout << L"'" << key << L"' = " << value << L";\n";
        }
        std::wcout << L"\n";
    }
    if (mode == 1) {
        std::wcout << "Dictionary with whitespaces\n";
        for (const auto& [key, value] : _dict_w) {
            std::wcout << L"'" << key << L"' = " << value << L";\n";
        }
        std::wcout << L"\n";
    }
}
void Text_Container::read_file(const char* filename)
{
    std::wifstream wif(filename);
    wif.imbue(std::locale(std::locale::empty(), new std::codecvt_utf8<wchar_t>));
    std::wstringstream wss;
    wss << wif.rdbuf();
    _text = wss.str();
}

void Text_Container::initialize()
{
    /// <summary>
    /// lowercase text
    /// </summary>
    /// <param name="mode"></param>
    std::wstring new_s;
    for (std::wstring::size_type i = 0; i < _text.size(); i++) {
        wchar_t temp = iterate_over_map(_text[i]);
        if (temp != '\1')
        {   //whitespaces
            wchar_t temp0 = iterate_over_dictionary(_text[i], 0);
            if (temp0 != '\1')
            {
                _f_w_text = _f_w_text + temp0;
            };
            //no whitespaces
            wchar_t temp1 = iterate_over_dictionary(_text[i], 1);
            if (temp1 != '\1')
            {
                _f_text = _f_text + temp1;
            };
            new_s = new_s + temp;
        };
    }
    _text = new_s;
}
//this function pring russian letters to lowercase
wchar_t Text_Container::iterate_over_map(wchar_t& temp) {
    if (temp == L'Ё' || temp == L'Э' || temp == L'ё' || temp == L'э') {
        temp = L'е';
    }
    if (temp == L'Ъ' || temp == L'ъ') {
        temp = L'ь';
    }
    for (const auto& [key, value] : _m) {
        if (temp == value) { return value; }
        else { if (temp == key) { return value; } }
    }
    return L'\1';
}
//this fucntion verifies input letter and if it's in the selected dictionary increases value by 1
wchar_t Text_Container::iterate_over_dictionary(wchar_t& temp, int mode) {
    std::map<wchar_t, int>::iterator itr;
    if (mode == 0)
    {
        for (itr = _dict_w.begin(); itr != _dict_w.end(); ++itr) {
            if (itr->first == temp) {
                itr->second++;
                return itr->first;
            }
        }
    }
    if (mode == 1)
    {
        for (itr = _dict.begin(); itr != _dict.end(); ++itr) {
            if (itr->first == temp) {
                itr->second++;
                return itr->first;
            }
        }
    }
    return L'\1';
}

类.h

#define _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS
#include <string_view>
#include <sstream>
#include <fstream>
#include <codecvt>
#include <iostream>
#include <string>
#include <map>
#include <string_view>

class Text_Container
{
private:
    //dictionaries to filter text & also bring them to lowercase
    const std::map<wchar_t, wchar_t> _m{ {L'А', L'а'}, {L'Б', L'б'}, {L'В', L'в'},
                                            {L'Г', L'г'}, {L'Д', L'д'}, {L'Е', L'е'},
                                            {L'Ж', L'ж'}, {L'З', L'з'}, {L'И', L'и'},
                                            {L'Й', L'й'}, {L'К', L'к'}, {L'Л', L'л'},
                                            {L'М', L'м'}, {L'Н', L'н'}, {L'О', L'о'},
                                            {L'П', L'п'}, {L'Р', L'р'}, {L'С', L'с'},
                                            {L'Т', L'т'}, {L'У', L'у'}, {L'Ф', L'ф'},
                                            {L'Ч', L'ч'}, {L'Ц', L'ц'}, {L'Ш', L'ш'},
                                            {L'Щ', L'щ'}, {L'Ы', L'ы'}, {L'Ь', L'ь'},
                                            {L'Ю', L'ю'}, {L'Я', L'я'}, {L' ', L' '}, };
    /*
    std::map<wchar_t, int> _dict_w{ {L'а', 0}, {L'б', 0}, {L' ',0},}; //mode 0
    std::map<wchar_t, int> _dict{ {L'а', 0}, {L'б', 0},}; //mode 1
    */
    //mode 0
    std::map<wchar_t, int> _dict_w{ {L'а', 0}, {L'б', 0}, {L'в', 0},
                                            {L'г', 0}, {L'д', 0}, {L'е', 0},
                                            {L'ж', 0}, {L'з', 0}, {L'и', 0},
                                            {L'й', 0}, {L'к', 0}, {L'л', 0},
                                            {L'м', 0}, {L'н', 0}, {L'о', 0},
                                            {L'п', 0}, {L'р', 0}, {L'с', 0},
                                            {L'т', 0}, {L'у', 0}, {L'ф', 0},
                                            {L'ч', 0}, {L'ц', 0}, {L'ш', 0},
                                            {L'щ', 0}, {L'ы', 0}, {L'ь', 0},
                                            {L'ю', 0}, {L'я', 0}, {L' ', 0}, };

    //mode 1
    std::map<wchar_t, int> _dict{ {L'а', 0}, {L'б', 0}, {L'в', 0},
                                            {L'г', 0}, {L'д', 0}, {L'е', 0},
                                            {L'ж', 0}, {L'з', 0}, {L'и', 0},
                                            {L'й', 0}, {L'к', 0}, {L'л', 0},
                                            {L'м', 0}, {L'н', 0}, {L'о', 0},
                                            {L'п', 0}, {L'р', 0}, {L'с', 0},
                                            {L'т', 0}, {L'у', 0}, {L'ф', 0},
                                            {L'ч', 0}, {L'ц', 0}, {L'ш', 0},
                                            {L'щ', 0}, {L'ы', 0}, {L'ь', 0},
                                            {L'ю', 0}, {L'я', 0}, };
    //vars
    std::wstring _text = L""; // inintial text
    std::wstring _f_text = L""; //mode 1
    std::wstring _f_w_text = L"";// mode 0
    //methods
    wchar_t iterate_over_map(wchar_t& temp);
    wchar_t iterate_over_dictionary(wchar_t& temp, int mode);
public:
    //constructor
    Text_Container(const char* filename) { read_file(filename); };
    //destructor

    //methods
    std::wstring Display_text() { return _text; }
    std::map<wchar_t, wchar_t> give_m() { return _m; };

    void print_map(std::wstring_view comment, const std::map<wchar_t, wchar_t>& m);
    void read_file(const char* filename);
    void print_dict(int mode);
    void initialize();
};

【问题讨论】：

也许从 UTF 转换到 Windows-1251 是一个不错的决定？
你应该坚持使用 Unicode。读取额外的字节不需要时间。您的代码的哪一部分很慢，或者您认为它很慢？
您是否正在测量调试版本的性能？
@IgorTandetnik 我是，但我创建了一个发布版本，结果仍然很糟糕。
@BarmakShemirani 我的意思是摆脱 wstring wchar 的东西可能更容易，因为在我看来这些和 STL 的兼容性很差。

标签： c++ windows unicode stl c++17

【解决方案1】：

我已经完成了类似的事情，但是用文字。

#include <filesystem>
#include <iostream>
#include <fstream>
#include <map>
#include <cmath>
#include <chrono>
#include <algorithm>
#include <vector>
#include <execution>
#include <thread>
#include <condition_variable>
#include <mutex>
#include <string>
#include <atomic>


void brough(void *w);

std::vector<std::pair<std::string,int>> tfinal;

std::weak_ptr<std::atomic_int> com_r;
std::condition_variable com_cvp;
std::atomic<bool> com_tr, com_onemore;
std::mutex com_mx;
std::thread h, dest;

int instore;

int main() 
{
    std::mutex com_mp;
    std::thread h = std::thread(&brough, (void *)nullptr);

    do
    {
        instore = com_r.lock() ? (int)*com_r.lock() : instore;
        com_onemore = false;
        std::unique_lock<std::mutex> lk(com_mp);
        std::string rend(instore , 'x');
        rend = rend + '\r';
        std::cout << rend << std::flush;
        com_cvp.wait_for(lk ,std::chrono::milliseconds(900) ,[] { return com_tr == !false||com_onemore == !false; }); 
    }while(!com_tr);

    h.join();
}


void brough(void *w)
{
    std::filesystem::path tf("b.txt");
    std::ifstream b(tf);
    std::shared_ptr<std::atomic_int> sp_tray = std::shared_ptr<std::atomic_int>(new std::atomic_int(0));
    std::vector<std::string> c;

    com_r = sp_tray;
    do{
        b >> *c.insert(c.end(),"");
    }while (!b.eof());

    auto t1 = std::chrono::system_clock::now();
    std::vector<std::string> t;

    int counter = 0; //with the stuff below final result will be 25.0 vs 24.947 now
    //std::atomic_int int counter = 0;

    for(int ft = 0 ; ft < 15; ft++)
    {
        std::mutex mtb;
        std::string torque;
        int rd = 0;
        std::for_each(std::execution::par, c.begin(), c.end(), [&](auto a){ int wr = std::count(t.begin(), t.end(), a) == 0 ? std::count( c.begin(), c.end(), a) : 0;
        *sp_tray = 25*++counter/(15*c.size()); std::unique_lock<std::mutex> lb(mtb);  torque = wr > rd ? a : torque; rd = std::max(rd , wr); });
        t.insert(t.end(), torque);
        std::unique_lock<std::mutex> lt(com_mx);
        tfinal.insert(tfinal.end(), { torque , rd } );
        com_onemore = !false;
        com_cvp.notify_one();
    }
    std::cout << std::endl; 
    for(auto k : t)
        std::cout  << k << std::endl;
    std::cout << std::endl << "sp_tray = " <<  25.0*counter/(15*c.size()) << std::endl;
    auto t2 = std::chrono::system_clock::now();
    std::chrono::milliseconds f =  std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1);
    std::cout  << f.count() << std::endl;
    
    com_tr = !false;
    com_cvp.notify_one();

}

它有进度条和性能测试

制作文件：

CC = c++ -c
CFLAGS = -std=c++17 -O3
LINKER = c++
GUILIBS = -ltbb -lpthread

t: f.o
    $(LINKER) $(LFLAGS)  f.o  $(GUILIBS) -o t

f.o: f.cpp
    $(CC) $CFLAGS() f.cpp

【讨论】：

【解决方案2】：

嗯，这看起来像一个非常昂贵的操作。

std::wstringstream wss;
wss << wif.rdbuf();
_text = wss.str();

您将 while 文件读入字符串流 wss（所以这是一个巨大的副本）。如果文件很大，内部缓冲区可能会被扩展多次，每次都会强制将字符串的副本复制到新位置。

然后，一旦您构建了流，您就可以将其复制到字符串 _text 中。所以这是另一个巨大的副本。

看起来不需要任何这些。只需从wif 读取而不是从_text 读取。

【讨论】：

【解决方案3】：

有几个方面需要调查缓慢。从最大到最慢。

您是否需要迭代地图而不是使用 at/get？这可以带来巨大的性能提升

for (itr = _dict_w.begin(); itr != _dict_w.end(); ++itr) {
            if (itr->first == temp) {
                itr->second++;
                return itr->first;
            }
        }

还有：

wchar_t Text_Container::iterate_over_map(wchar_t& temp) {
    if (temp == L'Ё' || temp == L'Э' || temp == L'ё' || temp == L'э') {
        temp = L'е';
    }
    if (temp == L'Ъ' || temp == L'ъ') {
        temp = L'ь';
    }
    for (const auto& [key, value] : _m) {
        if (temp == value) { return value; }
        else { if (temp == key) { return value; } }
    }
    return L'\1';
}

让我们谈谈模板/数据类型，所有数据集都需要地图吗？

可以尝试使用枚举。

这对本节有很大帮助。特别是如果你加入一些位掩码

if (temp == L'Ё' || temp == L'Э' || temp == L'ё' || temp == L'э') {
    temp = L'е';
}
if (temp == L'Ъ' || temp == L'ъ') {
    temp = L'ь';
}
for (const auto& [key, value] : _m) {
    if (temp == value) { return value; }
    else { if (temp == key) { return value; } }
}
return L'\1';

如果您可以记录某些功能/流程所花费的时间，可能会很有见地。

读取一个字符需要多长时间？
读取 X 字符需要多长时间？

需要考虑的一些问题：

您的 RAM 是什么样的？
您正在阅读的文件有多大？
- 把它们切成片会有帮助吗？
按值传递与按引用/指针传递
- 通过引用传递并将函数返回到 void 可能会有所帮助。

最后，我想说我也不是这方面的专家。所以我很想看看别人怎么说。

【讨论】：