【问题标题】:How do I check for word pairs in a string?如何检查字符串中的单词对?
【发布时间】:2019-03-09 08:13:39
【问题描述】:

我有下面的代码,它可以完美地创建一个字符串(大量 .txt 文件)中出现的单词的 CSV 列表,如下所示:

Name;Total
THE;23562
OF;15954
AND;15318
IN;12159
TO;11879
A;11145
I;6135
WAS;6045
etc...

我现在想要统计的是由两个单词组成的词对,如果足够简单的话,甚至可以是由三个单词组成的词组。例如:

Name;Total
OF THE;25
FROM THE;20
BY WHICH;13
OF WHICH;5
etc...

如何修改现有代码以检查成对而不是单个单词?

//chrisjopa.com/2016/04/21/counting-word-frequencies-with-javascript/

// Node's file-system module and the name of the text file to analyse.
var fs = require('fs');
var file = 'INPUT.txt';

// Column layout of the CSV output: the `name` and `total` fields of each
// record go under the "Name" and "Total" headers respectively.
const csvHeader = [
  { id: 'name', title: 'Name' },
  { id: 'total', title: 'Total' },
];

const { createObjectCsvWriter: createCsvWriter } = require('csv-writer');

// Change `path` to write the output file somewhere else.
const csvWriter = createCsvWriter({
  path: 'Data1.csv',
  header: csvHeader,
});

// Read the input file, count how often each word occurs, and write the
// counts to the CSV file sorted from most to least frequent.
fs.readFile(file, 'utf8', function (err, data) {

  // Nothing can be counted without the file contents, so fail loudly.
  if (err) throw err;

  const wordsArray = splitByWords(data);
  const wordsMap = createWordMap(wordsArray);
  const finalWordsArray = sortByCount(wordsMap);

  // Write CSV output file
  csvWriter
    .writeRecords(finalWordsArray)
    .then(() => console.log('DONE'))
    .catch((writeErr) => {
      // Report a failed write explicitly instead of leaving the promise
      // rejection unhandled, and signal the failure through the exit code.
      console.error('Failed to write CSV output:', writeErr);
      process.exitCode = 1;
    });

});


/**
 * Splits raw text into upper-cased words with punctuation stripped.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Words in order of appearance; an empty array when the
 *   text contains no words.
 */
function splitByWords (text) {

  // Punctuation is deleted, not replaced by a space, so "don't" becomes
  // "DONT" and "well-known" becomes "WELLKNOWN".
  var noPunctuation = text.replace(/[\.,-\/#!$%\^&\*;:{}�=\-_'`’~"()@\+\?><\[\]\+]/g, '');
  var allUpperCase  = noPunctuation.toUpperCase();

  // Splitting on whitespace runs already collapses repeated spaces. Leading
  // or trailing whitespace (or an empty text) would still produce '' entries
  // that end up counted as a word, so those are dropped.
  return allUpperCase.split(/\s+/).filter(function (word) {
    return word.length > 0;
  });
}

//This is the part in the code that I feel is the place to check for word 
//pairs, but I'm not sure how I'm supposed to write it.
/**
 * Counts how many times each entry occurs in an array of strings.
 *
 * @param {string[]} wordsArray - Words (or space-joined word groups) to tally.
 * @returns {Object<string, number>} Plain object mapping each distinct entry
 *   to its number of occurrences.
 */
function createWordMap (wordsArray) {

  // Tally in a Map so that entries such as "hasOwnProperty", "constructor"
  // or "__proto__" are ordinary keys and cannot collide with anything
  // inherited from Object.prototype.
  var counts = new Map();

  wordsArray.forEach(function (key) {
    // Keys are stringified, matching how a plain object would store them.
    var word = String(key);
    counts.set(word, (counts.get(word) || 0) + 1);
  });

  // Object.fromEntries defines every key as an own data property, so the
  // caller still gets a plain object usable with Object.keys().
  return Object.fromEntries(counts);

}


/**
 * Turns a word-count object into a list of records ordered by count.
 *
 * @param {Object<string, number>} wordsMap - Counts keyed by word.
 * @returns {Array<{name: string, total: number}>} One record per key, sorted
 *   by `total` in descending order.
 */
function sortByCount (wordsMap) {

  const byDescendingTotal = (left, right) => right.total - left.total;

  return Object.keys(wordsMap)
    .map((word) => ({ name: word, total: wordsMap[word] }))
    .sort(byDescendingTotal);

}

【问题讨论】:

  • 对不起,我需要补充一点:代码应该检查字符串中出现的每一个相邻单词对。以我刚写的这句英文为例:(Sorry I, I need, need to, to add, add that, 等等……)
  • 不要将其视为一对单词([a-zA-Z]+?){2} -- 将其视为一个字符串,([a-zA-Z]+?\s[a-zA-Z]+?) 之间有一个空格
  • @Seabottom,见this answer,它可能会有所帮助,因为它似乎在回答同样的问题。

标签: javascript count word


【解决方案1】:

从 wordsArray 创建另一个数组,把每一对相邻的单词拼接在一起。例如,对于下面的 wordsArray:

['Foo', 'Bar', 'Baz', 'Buzz']

创建:

['Foo Bar', 'Bar Baz', 'Baz Buzz']

然后,您可以直接使用您已有的完全相同的函数来统计每个词对出现的次数——只需调用 createWordMap(然后调用 sortByCount)。例如:

// Sample input: six words, giving five adjacent pairs ("Foo Bar" twice).
const wordsArray = ['Foo', 'Bar', 'Baz', 'Buzz', 'Foo', 'Bar'];

// Join every word except the first with the word that precedes it.
const wordPairsArray = wordsArray
  .slice(1)
  .map((word, index) => wordsArray[index] + ' ' + word);

// Pairs are plain strings, so the single-word helpers are reused as they are.
const wordPairMap = createWordMap(wordPairsArray);
const wordPairCount = sortByCount(wordPairMap);
console.log(wordPairCount);


// the following is your original code:
/**
 * Counts how many times each entry occurs in an array of strings.
 *
 * @param {string[]} wordsArray - Words (or space-joined word groups) to tally.
 * @returns {Object<string, number>} Plain object mapping each distinct entry
 *   to its number of occurrences.
 */
function createWordMap(wordsArray) {
  // A Map keeps entries such as "hasOwnProperty" or "__proto__" from
  // colliding with members inherited from Object.prototype while tallying.
  const counts = new Map();
  wordsArray.forEach(function(key) {
    // Keys are stringified, matching how a plain object would store them.
    const word = String(key);
    counts.set(word, (counts.get(word) || 0) + 1);
  });
  // Convert back to a plain object; every key becomes an own data property.
  return Object.fromEntries(counts);
}
/**
 * Converts a count object into records sorted from highest to lowest count.
 *
 * @param {Object<string, number>} wordsMap - Counts keyed by word.
 * @returns {Array<{name: string, total: number}>} One record per key, in
 *   descending order of `total`.
 */
function sortByCount(wordsMap) {
  const rows = [];
  for (const word of Object.keys(wordsMap)) {
    rows.push({ name: word, total: wordsMap[word] });
  }
  rows.sort((first, second) => second.total - first.total);
  return rows;
}

要将其扩展到不仅仅是对,只需更改循环以将动态数量的元素连接在一起:

/**
 * Builds every run of `wordsInItem` consecutive words, joined by a space.
 *
 * @param {string[]} words - Words in their original order.
 * @param {number} wordsInItem - How many consecutive words form one item
 *   (2 for pairs, 3 for triples, ...).
 * @returns {string[]} The joined runs in order of appearance; empty when
 *   `words` has fewer than `wordsInItem` entries.
 * @throws {RangeError} If `wordsInItem` is not a positive integer.
 */
function combineWords(words, wordsInItem) {
  // A size of zero, a negative size or a fractional size has no meaningful
  // result; without this check the loop would emit empty or misaligned items.
  if (!Number.isInteger(wordsInItem) || wordsInItem < 1) {
    throw new RangeError('wordsInItem must be a positive integer');
  }
  const items = [];
  // Slide a window of `wordsInItem` words across the list, one word at a time.
  for (let start = 0; start + wordsInItem <= words.length; start++) {
    items.push(words.slice(start, start + wordsInItem).join(' '));
  }
  return items;
}
/**
 * Logs how often each run of `wordsInItem` consecutive words occurs,
 * most frequent first.
 *
 * @param {string[]} words - Words in their original order.
 * @param {number} wordsInItem - Number of consecutive words per counted item.
 * @returns {void} The sorted counts are printed with console.log.
 */
function getCount(words, wordsInItem) {
  // Pipeline: group the words, tally the groups, then order by frequency.
  console.log(sortByCount(createWordMap(combineWords(words, wordsInItem))));
}

// Pairs: "Foo Bar" occurs twice in this input, every other pair once.
getCount(['Foo', 'Bar', 'Baz', 'Buzz', 'Foo', 'Bar'], 2);
// Triples: "Foo Bar Baz" occurs twice in this input, every other triple once.
getCount(['Foo', 'Bar', 'Baz', 'Buzz', 'Foo', 'Bar', 'Baz'], 3);





// the following is your original code:
/**
 * Counts how many times each entry occurs in an array of strings.
 *
 * @param {string[]} wordsArray - Words (or space-joined word groups) to tally.
 * @returns {Object<string, number>} Plain object mapping each distinct entry
 *   to its number of occurrences.
 */
function createWordMap(wordsArray) {
  // Tallying in a Map avoids clashes between input strings such as
  // "hasOwnProperty" or "__proto__" and members of Object.prototype.
  const counts = new Map();
  for (const key of wordsArray) {
    // Keys are stringified, matching how a plain object would store them.
    const word = String(key);
    counts.set(word, (counts.get(word) || 0) + 1);
  }
  // Hand back a plain object; each key is defined as an own data property.
  return Object.fromEntries(counts);
}
/**
 * Lists the entries of a count object from most to least frequent.
 *
 * @param {Object<string, number>} wordsMap - Counts keyed by word.
 * @returns {Array<{name: string, total: number}>} One record per key, ordered
 *   by `total` descending.
 */
function sortByCount(wordsMap) {
  const records = Object.keys(wordsMap).reduce((collected, word) => {
    collected.push({ name: word, total: wordsMap[word] });
    return collected;
  }, []);
  records.sort((lhs, rhs) => rhs.total - lhs.total);
  return records;
}

【讨论】:

  • 为什么不可能?通过从wordsArray 构造一个新数组,您拥有一个包含字符串中所有单词对的数组。不需要您在原始实现中已经拥有的额外信息,您只需要稍微操作一下即可。
  • 是的,抱歉,我一发布就意识到,这就是我删除评论的原因。这正是我正在寻找的解决方案,非常感谢。 :)
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 2013-05-02
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
相关资源
最近更新 更多