取出现次数最多的K个数题目思路代码Ref

122 阅读 0 评论 81 点赞

我是靠谱客的博主虚拟钢铁侠，这篇文章主要介绍取出现次数最多的K个数题目思路代码Ref，现在分享给大家，希望可以做个参考。

题目

搜索引擎会通过日志文件把用户每次检索使用的所有检索串都记录下来，每个查询串的长度为1-255字节。
假设目前有一千万个记录（这些查询串的重复度比较高，虽然总数是1千万，但如果除去重复后，不超过3百万个。一个查询串的重复度越高，说明查询它的用户越多，也就是越热门。），请你统计最热门的10个查询串，要求使用的内存不能超过1G。

思路

第一步：用Hashmap（STL中叫unordered_map）统计词频

第二步：用容量为K的最小堆取出出现次数最大的K个词
（参考 http://blog.csdn.net/fuyufjh/article/details/48369801）

代码

复制代码

#include <iostream>
#include <vector>
#include <queue>
#include <functional>
#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <map>
#include <string>
#include <unordered_map>
using namespace std;

typedef pair<string, int> Record;

struct RecordComparer {
    bool operator() (const Record &r1, const Record &r2) {
        return r1.second > r2.second;
    }
};

vector<Record> TopKNumbers(vector<string> &input, int k) {
    unordered_map<string, int> stat;
    for (const string &s : input) stat[s]++;
    priority_queue<Record, vector<Record>, RecordComparer> heap;
    auto iter = stat.begin();
    for (int i = 0; i < k && iter != stat.end(); i++, iter++) {
        heap.push(*iter);
    }
    for (; iter != stat.end(); iter++) {
        if (iter->second > heap.top().second) {
            heap.pop();
            heap.push(*iter);
        }
    }
    vector<Record> result;
    while (!heap.empty()) {
        result.push_back(heap.top());
        heap.pop();
    }
    return result;
}

/********  测试代码  *********/
int main() {
    clock_t cbegin, cend;
    vector<string> test;
    char buf[20];
    for (int i = 0; i < 100; i++) {
        int x = rand() % 20;
        sprintf(buf, "STR%d", x);
        test.push_back(string(buf));
    }
    auto result = TopKNumbers(test, 5);
    for (auto it = result.rbegin(); it != result.rend(); it++) {
        cout << it->first << 't' << it->second << endl;
    }
    printf("============================n");
    sort(test.begin(), test.end());
    for (const string &s : test) {
        cout << s << endl;
    }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#include <iostream>
#include <vector>
#include <queue>
#include <functional>
#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <map>
#include <string>
#include <unordered_map>
using namespace std;

typedef pair<string, int> Record;

struct RecordComparer {
    bool operator() (const Record &r1, const Record &r2) {
        return r1.second > r2.second;
    }
};

vector<Record> TopKNumbers(vector<string> &input, int k) {
    unordered_map<string, int> stat;
    for (const string &s : input) stat[s]++;
    priority_queue<Record, vector<Record>, RecordComparer> heap;
    auto iter = stat.begin();
    for (int i = 0; i < k && iter != stat.end(); i++, iter++) {
        heap.push(*iter);
    }
    for (; iter != stat.end(); iter++) {
        if (iter->second > heap.top().second) {
            heap.pop();
            heap.push(*iter);
        }
    }
    vector<Record> result;
    while (!heap.empty()) {
        result.push_back(heap.top());
        heap.pop();
    }
    return result;
}

/********  测试代码  *********/
int main() {
    clock_t cbegin, cend;
    vector<string> test;
    char buf[20];
    for (int i = 0; i < 100; i++) {
        int x = rand() % 20;
        sprintf(buf, "STR%d", x);
        test.push_back(string(buf));
    }
    auto result = TopKNumbers(test, 5);
    for (auto it = result.rbegin(); it != result.rend(); it++) {
        cout << it->first << 't' << it->second << endl;
    }
    printf("============================n");
    sort(test.begin(), test.end());
    for (const string &s : test) {
        cout << s << endl;
    }
}