【问题标题】:Trouble reading from a file. Appears to reach EOF prematurely从文件中读取时出现问题。似乎过早地达到 EOF
【发布时间】:2023-12-04 22:07:01
【问题描述】:

我确信这个问题可以相对容易地解决,但我很难找到问题所在。 我的代码只是从文件中读取所有单词,然后将每个单词、单词位置、句子的开头和结尾存储在一个数组中。数组被输出到另一个文本文件。

我可以阅读直到最后一句话的所有信息,然后我有一个错误。有什么想法吗?

/**
 *  Programmer: fryeguy
 *  Course: 
 *  Program: TxtCrawl for MicroSearch
 *
 *  Algorithm:
 *  TxtCrawl is the component of MicroSearch that reads text
 *  documents for search terms and stores them for
 *  indexing
 *
 *  1. Count words in doc, then initialize
 *     wordsFromDoc array to wordCount
 *  2. Initiate output file for writing.
 *  3. Open input file for reading words.
 *  4. Until reaching EOF:
 *     4.a. Set value for start "get pointer" in startSentence (.tellg()).
 *     4.b. Store value for end "get pointer" in endSentence (.tellg()).
 *     4.c. Reset "get pointer" to startSentence location.
 *     4.d. Until reaching endSentence, Read into the
 *          array theWord, wordPos, startSent, and endSent
 *  5. Write wordsFromDoc array to file
 *  6. When EOF is reached close the files.
 */

#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

using namespace std;

/**
 * One indexed word from the document.
 *
 * Every member has a default initializer so that a default-constructed
 * record never carries indeterminate values.
 */
struct wordProps        // stores word info to be placed in array
{
    std::string theWord;        // the word as extracted from the stream
    int         wordPos   = 0;  // stream position recorded for the word
    int         startSent = 0;  // stream position where the word's sentence starts
    int         endSent   = 0;  // stream position where the word's sentence ends
};

// Scans the named file and adds its character and word totals to the two
// reference arguments (file name, charCount, wordCount); defined after main.
void countWords(string, int&, int&);

int main()
{

    ifstream iFile; // file stream for reading in data
    ofstream oFile; // file stream for writing data

    string  iFileName = "TextFile2.txt";    // name of test file to read from
    string  oFileName = "OutputFile.txt";   // name of test file to write to
    string  aLine = "";                     // stores a line preceeding a newline character (\n)
    string  aWord = "";                     // stores words from doc for indexing
    int     charCount = 0;                  // count of characters in doc
    int     wordCount = 0;                  // count of words in doc
    int     aLineWordCount = 0;             // count of words in a single line being processed
    int     wordBegin = 0;                  // stores location of word in doc
    int     startSentence = 0;              // stores pointer value for start of sentence
    int     endSentence = 0;                // stores pointer value for end of sentence

    /**
     * 1. Count words in doc, then initialize
     *    wordsFromDoc array to wordCount
     */
    countWords(iFileName, charCount, wordCount);
    cout << "charCount: " << charCount << endl; // DEBUG CODE
    cout << "wordCount: " << wordCount << endl; // DEBUG CODE
    wordProps wordsFromDoc[wordCount];
    cout<< "length of array: " << (sizeof(wordsFromDoc) / sizeof(*wordsFromDoc)) << endl;  // DEBUG CODE

    /**
     * 2. Initiate output file for writing
     */
    oFile.open (oFileName.c_str()); // setup output file and write header
    oFile << setw(20) << left << "File Name: " << iFileName << endl;
    oFile << setw(20) << "---------------------------------------" << endl << endl;

    /**
     * 3. Open input file for reading words
     */
    iFile.open (iFileName.c_str());
    if (!iFile.is_open())
        cout << "No such file exists!" << endl;
    else
    {
        /**
         * 4. Until reaching EOF:
         */
        // I have been attempting different counting methods assuming the eof was being reached prematurely
        // The results really have not varied with this code
        // while (iFile.tellg() != charCount) 
        while (!iFile.eof())
        {
            //cout << "count: " << count << endl;
            /**
             * 4.a. Set value for start "get pointer" in startSentence (.tellg()).
             */
            startSentence = iFile.tellg();
            cout << "startSentence: " << startSentence << endl; // DEBUG CODE

            /**
             * 4.b. Store value for end "get pointer" in endSentence (.tellg()).
             */
            getline(iFile, aLine, '.');
            cout << aLine << endl; // DEBUG CODE
            endSentence = iFile.tellg();
            aLine.clear();
            cout << "endSentence: " << endSentence << endl; // DEBUG CODE

            if (!iFile.is_open())
            {
                cout << "The if, iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE
                iFile.close();
                iFile.open (iFileName.c_str());
            }

            /**
             * 4.c. Reset "get pointer" to startSentence location.
             */
            iFile.seekg(startSentence);
            cout << "iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE

            /**
             * 4.d. Until reaching endSentence, Read into the
             *      array theWord, wordPos, startSent, and endSent
             */

             // As the last line is about to be read there is an error of some sort.
             // My guess is that somehow I exceed the end of the file but my startSentence
             // and endSentence variables are pointing where I think they should.

            for ( ; iFile.tellg() < endSentence; aLineWordCount++)
            {
                wordsFromDoc[aLineWordCount].wordPos = iFile.tellg();
                cout << "wordPos: " << wordsFromDoc[aLineWordCount].wordPos << endl; // DEBUG CODE
                iFile >> wordsFromDoc[aLineWordCount].theWord;
                cout << "theWord: " << wordsFromDoc[aLineWordCount].theWord << endl; // DEBUG CODE
                wordsFromDoc[aLineWordCount].startSent = startSentence;
                cout << "startSent: " << wordsFromDoc[aLineWordCount].startSent << endl; // DEBUG CODE
                wordsFromDoc[aLineWordCount].endSent = endSentence;
                cout << "endSent: " << wordsFromDoc[aLineWordCount].endSent << endl << endl; // DEBUG CODE
                cout << "aLineWordCount: " << aLineWordCount << endl;
            } // end for

        } // end while !=iFile.eof

            // THIS section of code is never reached because of the hang up above.
            /**
             * 5. Write wordsFromDoc array to file
             */
            for (int count = 0; count < aLineWordCount; count++)
            {
                oFile << setw(20) << left
                << wordsFromDoc[count].theWord << " "
                << wordsFromDoc[count].wordPos << " "
                << wordsFromDoc[count].startSent << " "
                << wordsFromDoc[count].endSent << endl;
            }

    } // end else

    /**
     * 6. When EOF is reached close the files.
     */
    iFile.close();
    oFile.close();

// DEBUG CDODE for verifying results
//  for (int count = 0; count < wordCount; count++) {
//      cout << "theWord: " << wordsFromDoc[count].theWord << endl;
//      cout << "wordPos: " << wordsFromDoc[count].wordPos << endl;
//      cout << "startSent: " << wordsFromDoc[count].startSent << endl;
//      cout << "endSent: " << wordsFromDoc[count].endSent << endl << endl;
//  }

}

/**
 * Counts the characters and whitespace-separated words in a text file.
 *
 * The totals are added to charCount and wordCount; neither counter is
 * reset first, so callers normally pass in zero-initialised values.
 *
 * @param theFileName  path of the file to scan
 * @param charCount    incremented once per character read from the file
 * @param wordCount    incremented once per word successfully extracted
 *
 * If the file cannot be opened, "No such file exists!" is printed and both
 * counters are left unchanged.
 */
void countWords(std::string theFileName, int &charCount, int &wordCount)
{
    // Read-only stream: a plain fstream opens for read+write and would
    // fail on files the user may read but not modify.
    std::ifstream inFile(theFileName.c_str());
    if (!inFile.is_open())
    {
        std::cout << "No such file exists!" << std::endl;
        return;
    }

    // count the chars
    char theChar = ' ';
    while (inFile.get(theChar))
        ++charCount;

    // Rewind for the second pass; the first pass left eofbit/failbit set.
    inFile.clear();
    inFile.seekg(0, std::ios_base::beg);

    // count the words -- test the extraction itself, not eof(), so trailing
    // whitespace (or an empty file) does not produce a phantom extra word
    std::string theWord;
    while (inFile >> theWord)
        ++wordCount;
}

【问题讨论】:

  • 需要查看似乎失败的输入;该程序似乎适合任意输入。
  • 感谢您的回复。我已经用一些不同的文件进行了测试,下面这段文本会触发我所说的(不希望出现的)错误:“这是一个要读入搜索引擎爬虫的文本样本。我将键入几个句子,包括句点以提供一些中断。这一行出现在两个换行符之后。最后一段文字应该可以!”当代码到达最后一句时(我在那里读取 startSent、endSent 的值以及 wordPos,即 iFile.tellg()),iFile 似乎已被释放。
  • 您是否指的是由于句子以感叹号(!)而不是句号结尾而导致的失败?该代码严格编写为仅处理以句点结尾的句子。
  • 我在将其发送给您时就认出了这一点。最初,我编写代码来查找句点作为拆分句子和跟踪句子上下文的方法。这是搜索引擎的一部分,它不仅必须返回搜索词,还必须返回上下文。我计划使用该词所在的句子作为上下文。处理其他标点符号有什么建议吗?
  • 我宁愿在 C 中使用 getchar(),而不是在 C++ 中使用 cin >> achar;但无论哪种方式,都是逐个字符地读取,直到遇到句末标点(.?!),也许还包括(:),或者遇到 EOF,然后在那里设置 endOfSentence。不过,请先检查一下,也许 istream 有一个可以匹配多个分隔符的方法(或者可以通过子类化来实现),就像 scanf 在 C 中所做的那样。

标签: c++ fstream eof


【解决方案1】:

Istream

我查过了。Istream 的 get 和 getline 都没有可以一次处理多个分隔符的版本 [1]。

其他人也遇到过同样的问题 [2]。逐字符 IO 是最实用的解决方案。其他解决方案则需要为现有的 Istream 方法编写增强版本。

一个想法

  1. 立即将整个文件读入内存。
  2. 删除换行符(任何 CR 或 LF)。
  3. 将文档拆分为多行,使每一行都以一个句末分隔符结尾:在每个分隔符后面放置一个一致的标记(LF 或 ETX '\003'),同时将文档写回磁盘。
  4. 现在可以照常处理文档;但改用这个已知标记而不是句点作为分隔符。
  5. 删除保存重新定界文档的临时文件。

一次读入整个文档不是问题,因为无论如何它最终都会全部进入内存;把保存各个单词的字符串加在一起就等于整个文档。将重新分隔的文档写入磁盘后,即可释放内存。

注意事项

1 Istream::get
2 带有 getline 的多个分隔符(在 Code Guru 上讨论)

【讨论】:

  • 感谢您之前的评论。我相信我最终发现的主要问题在于数据的性质:这些文本是从不同网站复制粘贴来的,编码五花八门。我很确定我的代码是被意料之外的字符卡住了。
最近更新 更多