- UID
- 536
- 精华
- 积分
- 221
- 威望
- 点
- 宅币
- 个
- 贡献
- 次
- 宅之契约
- 份
- 最后登录
- 1970-1-1
- 在线时间
- 小时
|
技术博客http://www.cnblogs.com/itdef/ http://blog.csdn.net/stecdeng 技术交流群 群号码:324164944 欢迎c c++ windows驱动爱好者 服务器程序员沟通交流
本文是学习 多线程服务端编程的练习
书籍作者陈硕的博客也有提到这个题目
http://blog.csdn.net/solstice/article/details/8497475
第一个层次很简单 单机 一个小文件 读进来进行处理 然后对每个单词进行统计排序 记录每个单词出现频率- #include <algorithm>
- #include <iostream>
- #include <unordered_map>
- #include <vector>
- #include <sstream>
- #include <fstream>
-
- using namespace std;
-
- std::unordered_map<std::string, int> mapStringFrequent;
-
-
- void SplitStrig(std::string& line, std::vector<std::string>& vecString) {
- typedef std::string::size_type strSize;
- strSize i = 0;
- while (i != line.size()) {
- while (i != line.size() && isspace(line[i]))
- ++i;
- strSize j = i;
- while (j != line.size() && !isspace(line[j]))
- ++j;
- if (i != j) {
- vecString.push_back(line.substr(i, j - i));
- i = j;
- }
- }
- }
-
- bool CheckWord(std::string s) {
- typedef std::string::size_type strSize;
- strSize i = 0;
- while (i != s.size()) {
- if (isalpha(s[i])) {
- i++;
- }
- else {
- return false;
- }
- }
- return true;
- }
-
-
-
- void VecInputMap(std::vector<std::string>& vecString, std::unordered_map<std::string, int>& mapStringFrequent) {
- for (auto it : vecString) {
- if (CheckWord(it)) {
- transform(it.begin(), it.end(), it.begin(), ::tolower);
- mapStringFrequent[it]++;
- }
- }
- }
-
-
- bool ReadFile(const string& fileName) {
- bool ret = false;
- std::ifstream in(fileName);
- if (in.bad() || !in) {
- return ret;
- }
- std::string line;
- std::vector<std::string> vecString;
- try {
- while (getline(in, line))
- {
- //std::cout << line << std::endl;
- SplitStrig(line, vecString);
- }
-
- VecInputMap(vecString, mapStringFrequent);
- }
- catch (std::exception& e) {
- std::cout << line << std::endl;
- in.close();
- std::cerr << e.what() << std::endl;
- return ret;
- }
-
- in.close();
- ret = true;
- return ret;
- }
-
-
- void WriteToFile(const string& fileName, std::vector<std::pair<int, std::string>>& vecWordFreq) {
- std::ofstream out(fileName);
- if (out.bad() || !out) {
- return;
- }
-
- for (auto& it : vecWordFreq) {
- out << it.second << '\t' << it.first << std::endl;
- }
- out.close();
-
- }
- //============================================================
-
- int main()
- {
- bool ret = ReadFile("1.txt");
- if (!ret)
- return -1;
-
- //ret = ReadFile("2.txt");
- //if (!ret)
- // return -1;
- //ret = ReadFile("3.txt");
- //if (!ret)
- // return -1;
- //ret = ReadFile("4.txt");
- //if (!ret)
- // return -1;
-
- //ret = ReadFile("5.txt");
- //if (!ret)
- // return -1;
-
- //ret = ReadFile("6.txt");
- //if (!ret)
- // return -1;
-
- //ret = ReadFile("7.txt");
- //if (!ret)
- // return -1;
-
- //ret = ReadFile("8.txt");
- //if (!ret)
- // return -1;
-
- //ret = ReadFile("9.txt");
- //if (!ret)
- // return -1;
-
- //ret = ReadFile("10.txt");
- //if (!ret)
- // return -1;
-
- std::vector<std::pair<int, std::string>> freq;
- freq.reserve(mapStringFrequent.size());
-
- for (auto& it : mapStringFrequent){
- freq.push_back(make_pair(it.second, it.first));
- }
-
- std::sort(freq.begin(), freq.end(), [](const std::pair<int, std::string>& lhs, // const auto& lhs in C++14
- const std::pair<int, std::string>& rhs) {
- return lhs.first > rhs.first;
- });
-
- //for (auto it : freq) {
- // std::cout << it.first << '\t' << it.second << '\n';
- //}
-
- WriteToFile("freqResult.txt", freq);
- return 0;
- }
复制代码 第二个层次 就是文件较大 单词量较多 如果一次性读入并使用vector容器存放 以及使用hash容器进行统计频率 会出现内存等资源不足现象
那么就依次读取进内存 将解析的单词存放进vector中,当vector中的容量到达一个阈值 将vector中的单词放进map容器中统计单词出现频率 vector容器清空
但是map容器有多个 各个单词存放进那个map容器根据hash函数决定
当map容器的容量达到阈值后 写入文件 map容器清空
这样我们就得到多个记录单词出现频率的文本 供后继步骤分析合并 但是一个单词的出现频率肯定只出现在一个记录文本中(因为相同的单词具有相同的hash值 对应同一个记录文本)
如图
- #include <algorithm>
- #include <iostream>
- #include <unordered_map>
- #include <vector>
- #include <sstream>
- #include <fstream>
-
- using namespace std;
-
- #define BUCKET_NUM 10
- std::unordered_map<std::string, int> mapStringFrequent[BUCKET_NUM];
- std::ofstream out[BUCKET_NUM];
-
-
- #define MAX_WORD_SIZE 1024*1024
-
-
- void SplitStrig(std::string& line, std::vector<std::string>& vecString) {
- typedef std::string::size_type strSize;
- strSize i = 0;
- while (i != line.size()) {
- while (i != line.size() && isspace(line[i]))
- ++i;
- strSize j = i;
- while (j != line.size() && !isspace(line[j]))
- ++j;
- if (i != j) {
- vecString.push_back(line.substr(i, j - i));
- i = j;
- }
- }
- }
-
- bool CheckWord(std::string s) {
- typedef std::string::size_type strSize;
- strSize i = 0;
- while (i != s.size()) {
- if (isalpha(s[i])) {
- i++;
- }
- else {
- return false;
- }
- }
- return true;
- }
-
- void VecInputMap(std::vector<std::string>& vecString, std::unordered_map<std::string, int> mapStringFrequent[]) {
- for (auto it : vecString) {
- if (CheckWord(it)) {
- transform(it.begin(), it.end(), it.begin(), ::tolower);
- int idx = std::hash<string>()(it) % BUCKET_NUM;
- mapStringFrequent[idx][it]++;
- if (mapStringFrequent[idx].size() >= MAX_WORD_SIZE) {
- for (auto& it : mapStringFrequent[idx]) {
- out[idx] << it.second << '\t' << it.first << std::endl;
- }
- mapStringFrequent[idx].clear();
- }
- }
- }
- }
-
- bool ReadFile(const string& fileName) {
- bool ret = false;
- std::ifstream in(fileName);
- if (in.bad() || !in) {
- return ret;
- }
- std::string line;
- std::vector<std::string> vecString;
-
- try {
- while (getline(in, line))
- {
- SplitStrig(line, vecString);
- //std::cout << line << std::endl;
- if (vecString.size() >= MAX_WORD_SIZE) {
- //std::cout << "too many words" << std::endl;
- VecInputMap(vecString, mapStringFrequent);
- vecString.clear();
- }
- }
-
- for (int i = 0; i < BUCKET_NUM; i++) {
- for (auto& it : mapStringFrequent[i]) {
- out[i] << it.second << '\t' << it.first << std::endl;
- }
- mapStringFrequent[i].clear();
- }
-
- }
- catch (std::exception& e) {
- in.close();
- std::cerr << e.what() << std::endl;
- return ret;
- }
-
- std::cout << vecString.size() << std::endl;
- ret = true;
- return ret;
- }
-
- int main()
- {
- std::string fileName = "0bucket.txt";
- for (int i = 0; i < BUCKET_NUM; i++) {
- out[i].close();
- out[i].open(fileName);
- fileName[0]++;
- }
-
- ReadFile("LargeFile.txt");
-
-
- return 0;
- }
复制代码 运行效果如图
运行前
运行后
|
|