【发布时间】:2019-03-09 08:13:39
【问题描述】:
我有下面的代码,它可以完美地创建一个字符串(大量 .txt 文件)中出现的单词的 CSV 列表,如下所示:
Name;Total
THE;23562
OF;15954
AND;15318
IN;12159
TO;11879
A;11145
I;6135
WAS;6045
etc...
我现在想要统计的是由两个相邻单词组成的词对;如果足够简单的话,甚至可以统计由三个相邻单词组成的词组。例如:
Name;Total
OF THE;25
FROM THE;20
BY WHICH;13
OF WHICH;5
etc...
如何修改现有代码以检查成对而不是单个单词?
//chrisjopa.com/2016/04/21/counting-word-frequencies-with-javascript/
var fs = require('fs');
var file = 'INPUT.txt';
// Configure the CSV writer used for the output file.
// The records written later are objects with `name` and `total` keys
// (as produced by sortByCount); the header entries below pair each of those
// ids with the column titles 'Name' and 'Total'.
const createCsvWriter = require('csv-writer').createObjectCsvWriter;
const csvWriter = createCsvWriter({
// Output path of the CSV file; change it to any location you like.
path: 'Data1.csv',
header: [
{id: 'name', title: 'Name'},
{id: 'total', title: 'Total'},
]
});
// Read the input file from the current directory, count how often each word
// occurs and write the counts to the CSV file, most frequent first.
fs.readFile(file, 'utf8', function (err, data) {
  // A missing or unreadable input file is fatal for this script.
  if (err) throw err;
  const wordsArray = splitByWords(data);
  const wordsMap = createWordMap(wordsArray);
  const finalWordsArray = sortByCount(wordsMap);
  // Write the CSV output file.
  csvWriter
    .writeRecords(finalWordsArray)
    .then(() => console.log('DONE'))
    .catch(function (writeErr) {
      // Without a handler, a failed write would surface only as an
      // unhandled promise rejection; report it and signal failure instead.
      console.error('Failed to write CSV output:', writeErr);
      process.exitCode = 1;
    });
});
/**
 * Splits raw text into an array of upper-cased words.
 *
 * Punctuation and special characters are removed (not replaced by a space),
 * the text is upper-cased, and the result is split on runs of whitespace.
 * Blank input yields an empty array, and leading/trailing whitespace never
 * produces empty-string entries.
 *
 * @param {string} text - The text to tokenize.
 * @returns {string[]} The words of `text`, in order, in upper case.
 */
function splitByWords (text) {
  // Note: `,-\/` inside the character class is a range covering , - . and /
  const noPunctuation = text.replace(/[\.,-\/#!$%\^&\*;:{}�=\-_'`’~"()@\+\?><\[\]\+]/g, '');
  // Trim first: splitting a string with leading/trailing whitespace (or an
  // empty string) on /\s+/ would otherwise yield '' entries that later get
  // counted as a word.
  const normalized = noPunctuation.toUpperCase().trim();
  if (normalized === '') {
    return [];
  }
  // Splitting on /\s+/ already collapses repeated whitespace.
  return normalized.split(/\s+/);
}
//This is the part in the code that I feel is the place to check for word
//pairs, but I'm not sure how I'm supposed to write it.
/**
 * Counts how often each word, or each group of adjacent words, occurs.
 *
 * With the default `groupSize` of 1 every word is counted on its own. With a
 * larger `groupSize` every run of that many consecutive words is counted,
 * keyed by the words joined with a single space (e.g. "OF THE" for
 * groupSize 2). Groups overlap: ["A", "B", "C"] with groupSize 2 yields
 * "A B" and "B C".
 *
 * @param {string[]} wordsArray - Words in their original order.
 * @param {number} [groupSize=1] - Number of consecutive words per counted key.
 * @returns {Object<string, number>} Map from word (or word group) to its count.
 * @throws {RangeError} If `groupSize` is not a positive integer.
 */
function createWordMap (wordsArray, groupSize = 1) {
  if (!Number.isInteger(groupSize) || groupSize < 1) {
    throw new RangeError('groupSize must be a positive integer');
  }
  // Count in a Map so that words such as "hasOwnProperty" or "__proto__"
  // cannot collide with properties inherited from Object.prototype.
  const counts = new Map();
  const lastStart = wordsArray.length - groupSize;
  for (let start = 0; start <= lastStart; start++) {
    const key = groupSize === 1
      ? wordsArray[start]
      : wordsArray.slice(start, start + groupSize).join(' ');
    counts.set(key, (counts.get(key) || 0) + 1);
  }
  // Object.fromEntries defines own data properties, so even "__proto__"
  // becomes a regular countable key on the returned plain object.
  return Object.fromEntries(counts);
}
/**
 * Converts a word-count map into an array of records sorted by count.
 *
 * @param {Object<string, number>} wordsMap - Map from word to occurrence count.
 * @returns {{name: string, total: number}[]} One record per word, ordered by
 *   `total` from highest to lowest.
 */
function sortByCount (wordsMap) {
  const records = Object.entries(wordsMap).map(([word, count]) => ({
    name: word,
    total: count
  }));
  // Descending by count.
  records.sort((left, right) => right.total - left.total);
  return records;
}
【问题讨论】:
-
对不起,我需要补充一点:代码应该检查字符串中出现的每一个相邻单词对。以我刚写的英文句子 "Sorry I need to add that" 为例,词对依次是:(Sorry I, I need, need to, to add, add that, 等等……)
-
不要将其视为一对单词 ([a-zA-Z]+?){2} -- 而应将其视为中间带有一个空格的单个字符串:
([a-zA-Z]+?\s[a-zA-Z]+?) -
@Seabottom,见this answer,它可能会有所帮助,因为它似乎在回答同样的问题。
标签: javascript count word