【发布时间】:2023-12-04 22:07:01
【问题描述】:
我确信这个问题相对容易解决,但我一直找不到问题所在。我的代码从文件中读取所有单词,然后把每个单词、单词的位置以及所在句子的起始和结束位置存入一个数组,最后将该数组输出到另一个文本文件。
程序可以正确读取最后一句之前的所有内容,但在处理最后一句时出错。有什么想法吗?
/**
* Programmer: fryeguy
* Course:
* Program: TxtCrawl for MicroSearch
*
* Algorithm:
* TxtCrawl is the component of MicroSearch that reads text
* documents for search terms and stores them for
* indexing
*
* 1. Count words in doc, then initialize
* wordsFromDoc array to wordCount
* 2. Initiate output file for writing.
* 3. Open input file for reading words.
* 4. Until reaching EOF:
* 4.a. Set value for start "get pointer" in startSentence (.tellg()).
* 4.b. Store value for end "get pointer" in endSentence (.tellg()).
* 4.c. Reset "get pointer" to startSentence location.
* 4.d. Until reaching endSentence, Read into the
* array theWord, wordPos, startSent, and endSent
* 5. Write wordsFromDoc array to file
* 6. When EOF is reached close the files.
*/
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
/**
 * One indexed word together with the location of the sentence it came from.
 * All positions are character offsets into the input document.
 * Every member has a default so a freshly created entry never holds
 * indeterminate values.
 */
struct wordProps
{
    std::string theWord;  // the word itself
    int wordPos = 0;      // position of the word in the document
    int startSent = 0;    // start position of the sentence containing the word
    int endSent = 0;      // end position of the sentence containing the word
};
void countWords(string, int&, int&);
int main()
{
ifstream iFile; // file stream for reading in data
ofstream oFile; // file stream for writing data
string iFileName = "TextFile2.txt"; // name of test file to read from
string oFileName = "OutputFile.txt"; // name of test file to write to
string aLine = ""; // stores a line preceeding a newline character (\n)
string aWord = ""; // stores words from doc for indexing
int charCount = 0; // count of characters in doc
int wordCount = 0; // count of words in doc
int aLineWordCount = 0; // count of words in a single line being processed
int wordBegin = 0; // stores location of word in doc
int startSentence = 0; // stores pointer value for start of sentence
int endSentence = 0; // stores pointer value for end of sentence
/**
* 1. Count words in doc, then initialize
* wordsFromDoc array to wordCount
*/
countWords(iFileName, charCount, wordCount);
cout << "charCount: " << charCount << endl; // DEBUG CODE
cout << "wordCount: " << wordCount << endl; // DEBUG CODE
wordProps wordsFromDoc[wordCount];
cout<< "length of array: " << (sizeof(wordsFromDoc) / sizeof(*wordsFromDoc)) << endl; // DEBUG CODE
/**
* 2. Initiate output file for writing
*/
oFile.open (oFileName.c_str()); // setup output file and write header
oFile << setw(20) << left << "File Name: " << iFileName << endl;
oFile << setw(20) << "---------------------------------------" << endl << endl;
/**
* 3. Open input file for reading words
*/
iFile.open (iFileName.c_str());
if (!iFile.is_open())
cout << "No such file exists!" << endl;
else
{
/**
* 4. Until reaching EOF:
*/
// I have been attempting different counting methods assuming the eof was being reached prematurely
// The results really have not varied with this code
// while (iFile.tellg() != charCount)
while (!iFile.eof())
{
//cout << "count: " << count << endl;
/**
* 4.a. Set value for start "get pointer" in startSentence (.tellg()).
*/
startSentence = iFile.tellg();
cout << "startSentence: " << startSentence << endl; // DEBUG CODE
/**
* 4.b. Store value for end "get pointer" in endSentence (.tellg()).
*/
getline(iFile, aLine, '.');
cout << aLine << endl; // DEBUG CODE
endSentence = iFile.tellg();
aLine.clear();
cout << "endSentence: " << endSentence << endl; // DEBUG CODE
if (!iFile.is_open())
{
cout << "The if, iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE
iFile.close();
iFile.open (iFileName.c_str());
}
/**
* 4.c. Reset "get pointer" to startSentence location.
*/
iFile.seekg(startSentence);
cout << "iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE
/**
* 4.d. Until reaching endSentence, Read into the
* array theWord, wordPos, startSent, and endSent
*/
// As the last line is about to be read there is an error of some sort.
// My guess is that somehow I exceed the end of the file but my startSentence
// and endSentence variables are pointing where I think they should.
for ( ; iFile.tellg() < endSentence; aLineWordCount++)
{
wordsFromDoc[aLineWordCount].wordPos = iFile.tellg();
cout << "wordPos: " << wordsFromDoc[aLineWordCount].wordPos << endl; // DEBUG CODE
iFile >> wordsFromDoc[aLineWordCount].theWord;
cout << "theWord: " << wordsFromDoc[aLineWordCount].theWord << endl; // DEBUG CODE
wordsFromDoc[aLineWordCount].startSent = startSentence;
cout << "startSent: " << wordsFromDoc[aLineWordCount].startSent << endl; // DEBUG CODE
wordsFromDoc[aLineWordCount].endSent = endSentence;
cout << "endSent: " << wordsFromDoc[aLineWordCount].endSent << endl << endl; // DEBUG CODE
cout << "aLineWordCount: " << aLineWordCount << endl;
} // end for
} // end while !=iFile.eof
// THIS section of code is never reached because of the hang up above.
/**
* 5. Write wordsFromDoc array to file
*/
for (int count = 0; count < aLineWordCount; count++)
{
oFile << setw(20) << left
<< wordsFromDoc[count].theWord << " "
<< wordsFromDoc[count].wordPos << " "
<< wordsFromDoc[count].startSent << " "
<< wordsFromDoc[count].endSent << endl;
}
} // end else
/**
* 6. When EOF is reached close the files.
*/
iFile.close();
oFile.close();
// DEBUG CDODE for verifying results
// for (int count = 0; count < wordCount; count++) {
// cout << "theWord: " << wordsFromDoc[count].theWord << endl;
// cout << "wordPos: " << wordsFromDoc[count].wordPos << endl;
// cout << "startSent: " << wordsFromDoc[count].startSent << endl;
// cout << "endSent: " << wordsFromDoc[count].endSent << endl << endl;
// }
}
/**
 * Counts the characters and whitespace-separated words in a file.
 *
 * @param theFileName  path of the file to examine
 * @param charCount    incremented once for every character read
 * @param wordCount    incremented once for every word read
 *
 * Both counters are added to whatever values the caller passes in, so pass
 * zeros to obtain the totals for a single file. If the file cannot be
 * opened, a message is printed to std::cout and the counters are left
 * unchanged.
 */
void countWords(std::string theFileName, int &charCount, int &wordCount)
{
    // ifstream opens read-only, so files without write permission still work.
    std::ifstream inFile(theFileName.c_str());
    if (!inFile.is_open())
    {
        std::cout << "No such file exists!" << std::endl;
        return;
    }

    // count the chars
    char theChar = ' ';
    while (inFile.get(theChar))
        ++charCount;

    // Reaching the end set eofbit/failbit; clear them and rewind for pass two.
    inFile.clear();
    inFile.seekg(0, std::ios::beg);

    // count the words
    // Testing the extraction itself (instead of eof() beforehand) avoids
    // counting one extra "word" when the file is empty or ends in whitespace.
    std::string theWord;
    while (inFile >> theWord)
        ++wordCount;
}
【问题讨论】:
-
需要查看似乎失败的输入;该程序似乎适合任意输入。
-
感谢您的回复。我已经用几个不同的文件进行了测试,下面这段文本可以稳定地重现该错误:“这是一个要读入搜索引擎爬虫的文本样本。我将键入几个句子,包括句点以提供一些中断。这一行出现在两个换行符。最后一段文字应该可以!”当代码处理到最后一句时,从 startSent、endSent 以及 wordPos(iFile.tellg())读到的值来看,iFile 似乎已经失效。
-
您是否指的是由于句子以感叹号(!)而不是句号结尾而导致的失败?该代码严格编写为仅处理以句点结尾的句子。
-
我在将其发送给您时就认出了这一点。最初,我编写代码来查找句点作为拆分句子和跟踪句子上下文的方法。这是搜索引擎的一部分,它不仅必须返回搜索词,还必须返回上下文。我计划使用该词所在的句子作为上下文。处理其他标点符号有什么建议吗?
-
我更习惯使用 C 的 getchar(),而不是 C++ 的 cin >> achar;不过思路是一样的:逐个字符读取,直到遇到句末标点(. ? !),也许还包括冒号(:),或者遇到 EOF,然后在该位置设置 endOfSentence。另外可以查一下,istream 也许有能够匹配多个分隔符的方法(或者可以通过派生子类来实现),就像 C 中的 scanf 那样。