使用标准库：文本查询程序

我们将实现一个简单的文本查询程序，我们的程序允许用户在一个给定文件中查询单词。查询结果是单词在文件中出现的次数及其所在行的列表。如果一个单词在一行中出现多次，此行只列出一次。行会按照升序输出。

我们可以直接使用vector、set和map来编写文本查询程序，我们在这里还将定义一个更抽象的解决方案。首先我们将定义一个保存输入文件的类，它包含一个vector和一个map。vector用来保存输入文件的文本，map用来关联每个单词和它出现的行号的set。

1、直接使用vector、set 和map来编写文本查询程序

#include<iostream>
#include<fstream>
#include<sstream>
#include<vector>
#include<map>
#include<set>

using namespace std;

// Type used for line indices: the size_type of the vector that stores the lines.
using line_no = vector<string>::size_type;

/* file and wm are both globals so that the functions below can share them */
vector<string> file;          // contents of each line of the input file
map<string, set<line_no>>wm;  // maps each word to the set of line numbers it appears on

/**
 * Returns a normalized copy of `word`: every punctuation character is dropped
 * and every remaining character is converted to lower case.
 *
 * @param word  the raw token to normalize
 * @return      the normalized token (empty if `word` was all punctuation)
 */
string cleanup_str(const string &word)
{
    string ret;
    ret.reserve(word.size());
    for (const char c : word)
    {
        // The <cctype> functions require a value representable as unsigned char
        // (or EOF). Passing a negative plain char, e.g. a byte of a multi-byte
        // encoded character, is undefined behavior, so convert first.
        const unsigned char uc = static_cast<unsigned char>(c);
        if (!ispunct(uc))
            ret += static_cast<char>(tolower(uc));
    }
    return ret;
}

void input_text(ifstream &is)
{
    string text;
    while (getline(is, text))
    {
        file.push_back(text);
        int n = file.size() - 1;
        istringstream line(text);
        string word;
        while (line >> word)
        {
            wm[cleanup_str(word)].insert(n);
        }
    }
}

/**
 * Looks `sought` up in the global word map `wm` and writes the result to `os`:
 * a zero-occurrence message if the word is absent; otherwise the number of
 * lines it appears on followed by each such line (1-based number plus text
 * taken from the global `file`).
 *
 * @param sought  the word to look up, matched exactly against the keys of `wm`
 * @param os      stream that receives the report
 * @return        `os`, to allow chaining
 */
ostream &query_and_print(const string &sought, ostream&os)
{
    const auto loc = wm.find(sought);
    if (loc == wm.end())
    {
        os << sought << "出现了0次" << endl;
    }
    else
    {
        // Bind by reference: copying the whole set for every query is unnecessary.
        const auto &lines = loc->second;
        os << sought << " 出现了 " << lines.size() << "次" << endl;
        for (const auto num : lines)
        {
            // Line indices are stored zero-based; print them one-based.
            os << "\t(第" << num + 1 << "行）" << file[num] << endl;
        }
    }
    return os;
}

/**
 * Builds the index from `infile` via input_text, then repeatedly prompts on
 * standard output, reads one word from standard input and prints the lookup
 * result. The loop ends when reading fails or the user enters "q".
 */
void runqueries(ifstream& infile)
{
    input_text(infile);

    string word;
    for (;;)
    {
        cout << "enter word to look for, or q to quit: ";

        const bool got_word = static_cast<bool>(cin >> word);
        if (!got_word)
            return;   // end of input or stream error
        if (word == "q")
            return;   // explicit quit request

        query_and_print(word, cout) << endl;
    }
}

/**
 * Entry point: opens the input file and starts the interactive query loop.
 *
 * @return 0 on success, 1 if the input file could not be opened
 */
int main()
{
    const char *path = "file.txt";
    ifstream in(path);
    if (!in)
    {
        // Name the file in the diagnostic, send it to the error stream, and
        // report the failure through the exit status.
        cerr << "couldn't open: " << path << endl;
        return 1;
    }
    runqueries(in);
    return 0;
}

1、我们使用一个vector来保存整个文件的一份拷贝。
2、使用一个istringstream来将每行分解为单词。
3、使用一个set来保存每个单词在输入文本中出现的行号，保证了每行只出现一次，且按升序保存。
4、最后使用一个map就将单词与其对应的行号关联起来。
结果如下：
这里写图片描述

2、使用面向对象的思维来实现
首先我们从定义一个保存输入文件的类开始：

#pragma once
#include<map>
#include<string>
#include<memory>
#include<vector>
#include<fstream>
#include<set>
#include<sstream>
#include<iostream>

using namespace std;

class QueryResult;   // forward declaration: query() returns this type by value

/**
 * Holds the text of an input file together with an index from each word to
 * the set of line numbers on which it appears.
 *
 * Both the text and the per-word line sets are held through shared_ptr so
 * that they can be shared with the QueryResult objects returned by query().
 */
class Textquery {
public:
    /// Type used for line indices (the size_type of the line container).
    using line_no = vector<string>::size_type;

    /// Builds the index from the given input file stream.
    /// explicit: an ifstream should never convert silently into a Textquery.
    explicit Textquery(ifstream&);

    /// Looks up a word and returns the corresponding QueryResult.
    QueryResult query(const string&) const;
private:
    shared_ptr<vector<string>> file;              // the input file, one element per line
    map<string, shared_ptr<set<line_no>>> wm;     // maps each word to the line numbers it appears on
};

我们定义的保存文件的类中包含两个私有成员：一个vector和一个map。与上述方法一类似，我们使用一个vector来保存输入的文本文件，map用来关联每个单词和它出现的行号set。
并且将定义一个构造函数，从给定的输入文件来构造此对象如下：


#include "Textquery.h"


/**
 * Reads `is` line by line. Each line is appended to the shared line vector,
 * and for each whitespace-separated word on the line the zero-based line
 * index is inserted into that word's line set (allocated on first sight).
 *
 * @param is  input file stream; consumed until getline fails
 */
Textquery::Textquery(ifstream&is) :file(make_shared<vector<string>>())
{
    string text;
    while (getline(is, text))
    {
        file->push_back(text);                // store every line of the text
        // Use line_no (the container's size_type) rather than int so the
        // index is not narrowed or converted through a signed type.
        const line_no n = file->size() - 1;   // index of the current line
        istringstream line(text);
        string word;
        while (line >> word)                  // handle each word on the line
        {
            // operator[] inserts a null pointer the first time a word is seen.
            shared_ptr<set<line_no>> &lines = wm[word];
            if (!lines)
                lines = make_shared<set<line_no>>();   // allocate the set for a new word
            lines->insert(n);                 // record this line for the word
        }
    }
}

除此之外，我们在此类中定义了一个成员函数来执行查询的操作。查询操作要完成的任务十分简单：查找map成员，检查给定单词是否出现。这个函数将要返回我们所查询的结果（即出现的次数、行号，以及每行的文本）。为此我们定义另外一个类来保存查询结果，并且这个类可以打印对应查询的内容。

#pragma once
#include "Textquery.h"

class QueryResult {
    friend ostream& print(ostream&, const QueryResult&);
public:
    QueryResult(string s, shared_ptr<set<Textquery::line_no>> p, shared_ptr<vector<string>> f) :
        sought(s), lines(p), file(f) {}
private:
    string sought;                                          //查询的单词
    shared_ptr<set<Textquery::line_no>> lines;               //出现的行号
    shared_ptr<vector<string>> file;                         //输入文件
};


/**
 * Writes a report for `qr` to `os`: a header with the word and the number of
 * lines in its set, then each of those lines with its 1-based number.
 *
 * @return `os`, to allow chaining
 */
ostream& print(ostream& os, const QueryResult&qr)
{
    const auto &hits = *qr.lines;
    const auto &text = *qr.file;

    os << qr.sought << " occurs " << hits.size() << " " << "times" << endl;

    for (auto it = hits.begin(); it != hits.end(); ++it)
    {
        // Stored indices are zero-based; show them one-based.
        os << "\t(line" << *it + 1 << ")" << text[*it] << endl;
    }
    return os;
}

/**
 * Looks `sought` up in the word map and returns a QueryResult that shares
 * the file text and the matching line set with this object.
 */
QueryResult Textquery::query(const string &sought) const
{
    // Shared empty set handed out whenever the word is not in the map.
    static shared_ptr<set<line_no>> nodata(new set<line_no>);

    const auto hit = wm.find(sought);
    if (hit != wm.end())
        return QueryResult(sought, hit->second, file);

    return QueryResult(sought, nodata, file);
}

最后测试一下：


#include "Textquery.h"
#include"QueryResult.h"

/**
 * Builds a Textquery from `infile`, then repeatedly prompts on standard
 * output, reads one word from standard input and prints the query result.
 * The loop ends when reading fails or the user enters "q".
 */
void runqueries(ifstream& infile)
{
    Textquery tq(infile);

    string word;
    for (;;)
    {
        cout << "enter word to look for, or q to quit: ";

        const bool got_word = static_cast<bool>(cin >> word);
        if (!got_word)
            return;   // end of input or stream error
        if (word == "q")
            return;   // explicit quit request

        print(cout, tq.query(word)) << endl;
    }
}

/**
 * Entry point: opens the input file and starts the interactive query loop.
 *
 * @return 0 on success, 1 if the input file could not be opened
 */
int main()
{
    const char *path = "file.txt";
    ifstream in(path);
    if (!in)
    {
        // Name the file in the diagnostic, send it to the error stream, and
        // report the failure through the exit status.
        cerr << "couldn't open: " << path << endl;
        return 1;
    }
    runqueries(in);
    return 0;
}

这里写图片描述

与方法一不同的是，我们要在不同的类对象间共享数据，我们使用了shared_ptr来管理。
说明：我们的QueryResult类要表达查询的结果，而这些数据都保存在Textquery类型的对象中，因此我们必须确定如何访问它们。我们可以拷贝行号set，但这样可能会很耗时。此外，我们不希望拷贝vector，因为这可能会引起整个文件的拷贝。通过返回指向Textquery对象内部的指针，我们可以避免拷贝操作，但是如果Textquery对象在QueryResult对象之前被销毁，我们将访问非法的内存，因此我们使用shared_ptr来处理这种共享问题。

使用标准库：文本查询程序

猜你喜欢