【问题标题】:Flesch-Kincaid readability test(Flesch-Kincaid 可读性测试)
【发布时间】:2012-02-27 09:30:03
【问题描述】:

是否有任何处理 Flesch-Kincaid 可读性计算的开源 .Net 库?

维基:http://en.wikipedia.org/wiki/Flesch-Kincaid_readability_test

【问题讨论】:

    标签: c# readability flesch-kincaid


    【解决方案1】:

    不是开源的,但您可以使用 the ReadabilityStatistic interface 委托给 Word。即使您的文档开始时不在 Word 中,您也可以打开 Word(对用户不可见),将文本转储到 Word 中,然后使用 ReadabilityStatistic 计算统计信息。

    【讨论】:

      【解决方案2】:

      如 Flesch-Kincaid 年级水平公式所述:

      https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

      您需要计算单词、句子和音节。音节可能是最棘手的,尽管句子也需要一些思考。

      下面是我把其他人写的两份音节计数代码翻译成 F# 的结果(F# 属于 .NET,您可以在 Visual Studio 中创建一个 F# 项目,然后从您的 C# 项目中引用该项目)。我对此做过基本的测试,但并不全面。

      我发现 Ipeirotis 在我的一些测试用例中(一旦我添加了问题单词列表)比 Child 给出了更好的结果。我的测试词是:

      // Sample words used to compare the two syllable counters.
      let testWords =
          [|
              "abalone"
              "gracious"
              "atheism"
              "unaware"
              "seaside"
              "underwater"
              "wonderwoman"
              "biology"
          |]
      

      Child 的代码尤其是列表末尾有问题。将正则表达式从最长词缀重新排序到最短似乎并不能解决它。

      我的翻译:

      module Readability
      
      open System.Text.RegularExpressions
      //for syllables
      //simpler:
      //https://github.com/ipeirotis/ReadabilityMetrics/blob/master/src/main/java/com/ipeirotis/readability/engine/Syllabify.java
      
      /// <summary>
      /// Estimates the number of syllables in an English word by counting vowel
      /// groups and then adjusting for letter patterns that are known to add or
      /// remove a syllable. Words listed in <c>problemWordMap</c> bypass the
      /// heuristic and use the stored count.
      /// </summary>
      /// <param name="word">The word to analyse; casing is ignored.</param>
      /// <returns>The estimated syllable count, never less than 1.</returns>
      // NOTE(review): F# resolves names top-down, so `problemWordMap` has to be
      // declared before this function in the compiled module.
      let SyllableCount2 (word:string) =
          // Patterns whose presence removes one syllable from the vowel-group count.
          let SubSyl = [| "cial"; "tia"; "cius"; "cious"; "giu"; "ion"; "iou"; "sia$"; ".ely$" |]
          // Patterns whose presence adds one syllable to the vowel-group count.
          let AddSyl = [| "ia"; "riet"; "dien"; "iu"; "io"; "ii"; "[aeiouym]bl$"; "[aeiou]{3}"; "^mc"; "ism$"; "[^aeiouy][^aeiouy]l$"; "[^l]lien"; "^coa[dglx]."; "[^gq]ua[^auieo]"; "dnt$" |]

          // Lower-case once, culture-invariantly, so that both the exception table
          // (whose keys are lower-case) and the regexes see the same text
          // regardless of the caller's casing or the current culture.
          let lowered = word.ToLowerInvariant()

          if problemWordMap.ContainsKey( lowered ) then
              problemWordMap.[lowered]
          else
              let withoutApostrophe = lowered.Replace("'", " ")

              if withoutApostrophe = "i" || withoutApostrophe = "a" then
                  // The general path below would count these one-letter words twice.
                  1
              else
                  // A trailing "e" is dropped before counting vowel groups.
                  let stem =
                      if withoutApostrophe.EndsWith("e") then
                          withoutApostrophe.Substring(0, withoutApostrophe.Length - 1)
                      else
                          withoutApostrophe

                  // Number of patterns in the given list that occur in the stem.
                  let countMatching (patterns : string[]) =
                      patterns
                      |> Array.filter (fun pattern -> Regex.IsMatch(stem, pattern))
                      |> Array.length

                  // Each maximal run of vowels counts as one syllable.
                  let vowelGroups =
                      Regex.Split(stem, "[^aeiouy]+")
                      |> Array.filter (fun group -> group.Length > 0)
                      |> Array.length

                  let singleLetterBonus = if stem.Length = 1 then 1 else 0

                  let syl = vowelGroups + countMatching AddSyl - countMatching SubSyl + singleLetterBonus

                  // Every word has at least one syllable; this also guards against the
                  // subtractive patterns pushing the total below zero.
                  max syl 1
      
      //https://github.com/DaveChild/Text-Statistics/blob/master/src/DaveChild/TextStatistics/Syllables.php
      
      // Exception table: words whose syllable count the heuristics get wrong,
      // mapped to their correct count. Keys are lower-case.
      // Entries tagged "ap" are plural forms added alongside the original list.
      let problemWordMap =
          dict [
              ("abalone", 4)
              ("abare", 3)
              ("abed", 2)
              ("abruzzese", 4)
              ("abbruzzese", 4)
              ("aborigine", 5)
              ("aborigines", 5) // ap
              ("acreage", 3)
              ("acreages", 3) // ap (was a duplicate "acreage" key)
              ("adame", 3)
              ("adieu", 2)
              ("adobe", 3)
              ("anemone", 4)
              ("anemones", 4) // ap
              ("apache", 3)
              ("apaches", 3) // ap
              ("aphrodite", 4)
              ("apostrophe", 4)
              ("apostrophes", 4) // ap
              ("ariadne", 4)
              ("cafe", 2)
              ("cafes", 2) // ap
              ("calliope", 4)
              ("catastrophe", 4)
              ("catastrophes", 4) // ap
              ("chile", 2)
              ("chiles", 2) // ap
              ("chloe", 2)
              ("circe", 2)
              ("coyote", 3)
              ("coyotes", 3) // ap
              ("epitome", 4)
              ("forever", 3)
              ("gethsemane", 4)
              ("guacamole", 4)
              ("guacamoles", 4) // ap
              ("hyperbole", 4)
              ("hyperboles", 4) // ap
              ("jesse", 2)
              ("jukebox", 2)
              ("jukeboxes", 3) // ap; the plural "-es" adds a syllable (juke-box-es)
              ("karate", 3)
              ("karates", 3) // ap
              ("machete", 3)
              ("maybe", 2)
              ("people", 2)
              ("recipe", 3)
              ("sesame", 3)
              ("shoreline", 2)
              ("simile", 3)
              ("machetes", 3) // ap
              ("maybes", 2) // ap
              ("peoples", 2) // ap
              ("recipes", 3) // ap
              ("sesames", 3) // ap
              ("shorelines", 2) // ap
              ("similes", 3) // ap
              ("syncope", 3)
              ("tamale", 3)
              ("tamales", 3) // ap
              ("yosemite", 4)
              ("daphne", 2)
              ("eurydice", 4)
              ("euterpe", 3)
              ("hermione", 4)
              ("penelope", 4)
              ("persephone", 4)
              ("phoebe", 2)
              ("zoe", 2)
          ]
      
      // Letter groups that the vowel-group count treats as two syllables although
      // they are pronounced as one. All alternatives are merged into one regex.
      let oneSyllableCorrection =
          let patterns =
              [
                  "cia(l|$)" // glacial, acacia
                  "tia"
                  "cius"
                  "cious"
                  "[^aeiou]giu"
                  "[aeiouy][^aeiouy]ion"
                  "iou"
                  "sia$"
                  "eous$"
                  "[oa]gue$"
                  ".[^aeiuoycgltdb]{2,}ed$"
                  ".ely$"
                  // Disabled: "[cg]h?ed?$"
                  // Disabled: "rved?$"
                  // Disabled: "[aeiouy][dt]es?$"
                  // Disabled: "^[dr]e[aeiou][^aeiou]+$" (sorts out deal, deign etc)
                  // Disabled: "[aeiouy]rse$" (purse, hearse)
                  "^jua"
                  // Disabled: "nne[ds]?$" (canadienne)
                  "uai" // acquainted
                  "eau" // champeau
                  // Disabled: "pagne[ds]?$" (champagne)
                  // Disabled: "[aeiouy][^aeiuoytdbcgrnzs]h?e[rsd]?$"

                  // The next rule detects words ending with a soft "e". Don't
                  // touch it unless you absolutely have to! Words to test a new
                  // version of the rule with (add 'r', 's' and 'd' where possible
                  // to test fully):
                  //   absolve, acquiesce, audience, ache, acquire, brunelle, byrne,
                  //   canadienne, coughed, curved, champagne, designate, force,
                  //   lace, late, lathe, make, relayed, scrounge, side, sideline,
                  //   some, wide, taste
                  "[aeiouy](b|c|ch|d|dg|f|g|gh|gn|k|l|ll|lv|m|mm|n|nc|ng|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y|z)e$"

                  // Soft "e" endings followed by "d". Test words:
                  //   crunched, forced, hated, sided, sidelined, unexploded,
                  //   unexplored, scrounged, squelched, forced
                  "[aeiouy](b|c|ch|dg|f|g|gh|gn|k|l|lch|ll|lv|m|mm|n|nc|ng|nch|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|th|v|y|z)ed$"

                  // Soft "e" endings followed by "s". Test words:
                  //   absences, accomplices, acknowledges, advantages, byrnes,
                  //   crunches, forces, scrounges, squelches
                  "[aeiouy](b|ch|d|f|gh|gn|k|l|lch|ll|lv|m|mm|n|nch|nn|p|r|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y)es$"

                  "^busi$"
              ]
          Regex(String.concat "|" patterns)
      
      
      // Letter groups that the vowel-group count treats as one syllable although
      // they are pronounced as two. All alternatives are merged into one regex.
      let twoSyllableCorrection =
          let patterns =
              [
                  "([^s]|^)ia"
                  "riet"
                  "dien" // audience
                  "iu"
                  "io"
                  "eo($|[b-df-hj-np-tv-z])"
                  "ii"
                  "[ou]a$"
                  "[aeiouym]bl$"
                  "[aeiou]{3}"
                  "[aeiou]y[aeiou]"
                  "^mc"
                  "ism$"
                  "asm$"
                  "thm$"
                  "([^aeiouy])\1l$"
                  "[^l]lien"
                  "^coa[dglx]."
                  "[^gq]ua[^auieo]"
                  "dnt$"
                  "uity$"
                  "[^aeiouy]ie(r|st|t)$"
                  "eings?$"
                  "[aeiouy]sh?e[rsd]$"
                  "iell"
                  "dea$"
                  "real" // real, cereal
                  "[^aeiou]y[ae]" // bryan, byerley
                  "gean$" // aegean
                  "uen" // influence, affluence
              ]
          Regex(String.concat "|" patterns)
      
      // Prefixes (anchored with ^) and suffixes (anchored with $) that are
      // pronounced as a single syllable, merged into one regex.
      let oneSyllableAffix =
          let patterns =
              [
                  "^un"
                  "^fore"
                  "^ware"
                  "^none?"
                  "^out"
                  "^post"
                  "^sub"
                  "^pre"
                  "^pro"
                  "^dis"
                  "^side"
                  "ly$"
                  "less$"
                  "some$"
                  "ful$"
                  "ers?$"
                  "ness$"
                  "cians?$"
                  "ments?$"
                  "ettes?$"
                  "villes?$"
                  "ships?$"
                  "sides?$"
                  "ports?$"
                  "shires?$"
                  "tion(ed)?$"
              ]
          Regex(String.concat "|" patterns)
      
      // Prefixes (anchored with ^) and suffixes (anchored with $) that are
      // pronounced as two syllables, merged into one regex.
      let twoSyllableAffix =
          let patterns =
              [
                  "^above"
                  "^ant[ie]"
                  "^counter"
                  "^hyper"
                  "^afore"
                  "^agri"
                  "^in[ft]ra"
                  "^inter"
                  "^over"
                  "^semi"
                  "^ultra"
                  "^under"
                  "^extra"
                  "^dia"
                  "^micro"
                  "^mega"
                  "^kilo"
                  "^pico"
                  "^nano"
                  "^macro"
                  "berry$"
                  "woman$"
                  "women$"
              ]
          Regex(String.concat "|" patterns)
      
      // Suffixes (anchored with $) that are pronounced as three syllables,
      // merged into one regex.
      let threeSyllableAffix =
          let patterns =
              [
                  "ology$"
                  "ologist$"
                  "onomy$"
                  "onomist$"
              ]
          Regex(String.concat "|" patterns)
      
      /// <summary>
      /// Removes every match of <c>regex</c> from <c>word</c> and reports how many
      /// matches were removed.
      /// </summary>
      /// <param name="regex">Compiled pattern whose matches are stripped.</param>
      /// <param name="word">The word to strip.</param>
      /// <returns>A tuple of the stripped word and the number of matches.</returns>
      let RegexReplace (regex:Regex) (word:string) =
          let matchCount = regex.Matches(word).Count
          // Let the regex do the removal so only the text at each match position
          // is deleted. Replacing by match *value* would also delete identical
          // substrings elsewhere in the word (e.g. "^un" turning "unsung" into "ssg").
          let bareWord = regex.Replace(word, "")
          bareWord, matchCount
      
      /// Returns the number of non-overlapping matches of `regex` in `word`.
      let CountMatches (regex:Regex) word =
          regex.Matches(word).Count
      
      /// <summary>
      /// Counts the syllables in a word. Assumes the word has already been
      /// "cleaned" (lower-cased, punctuation removed). Words present in
      /// <c>problemWordMap</c> return their stored count; all others are handled by
      /// stripping known affixes, counting vowel groups in what is left, and then
      /// applying the one-/two-syllable corrections.
      /// </summary>
      /// <param name="word">The cleaned word to analyse.</param>
      /// <returns>The estimated syllable count, never less than 1.</returns>
      let SyllableCount( word : string) =
          if problemWordMap.ContainsKey( word ) then
              problemWordMap.[word]
          else
              // Strip affixes in turn, remembering how many of each kind were removed.
              let wordMinus1Affix, oneAffixCount = RegexReplace oneSyllableAffix word
              let wordMinus2Affix, twoAffixCount = RegexReplace twoSyllableAffix wordMinus1Affix
              let stem, threeAffixCount = RegexReplace threeSyllableAffix wordMinus2Affix

              // Each affix contributes as many syllables as its table says:
              // one, two or three per removed affix.
              let affixSyllables = oneAffixCount + 2 * twoAffixCount + 3 * threeAffixCount

              // Every maximal run of vowels left in the stem counts as one syllable.
              let wordPartCount =
                  Regex.Split(stem, "[^aeiouy]+")
                  |> Array.filter (fun part -> part.Length > 0)
                  |> Array.length

              // Corrections are evaluated on the stem: the affixes have already been
              // accounted for, so matching them again on the full word would count
              // them twice (e.g. "side" in "seaside", "io" in "biology").
              let countedTwoButOne = CountMatches oneSyllableCorrection stem
              let countedOneButTwo = CountMatches twoSyllableCorrection stem

              // A word always has at least one syllable.
              max 1 (affixSyllables + wordPartCount - countedTwoButOne + countedOneButTwo)
      

      为了处理句子计数,我使用了斯坦福解析器的 nuget 包并创建了这个包装器:

      using System;
      using System.Collections.Generic;
      using System.Linq;
      using System.Text;
      using System.Threading.Tasks;
      using edu.stanford.nlp.process;
      using edu.stanford.nlp.util;
      
      namespace StanfordWrapper
      {
          /// <summary>
          /// Thin wrapper that splits raw text into sentences using the Stanford
          /// <c>DocumentPreprocessor</c>.
          /// </summary>
          public class SentenceTokenizer
          {
              /// <summary>
              /// Shared PTB tokenizer factory; configured to leave brackets
              /// un-normalized and to be invertible.
              /// </summary>
              public static readonly TokenizerFactory TokenizerFactory = PTBTokenizer.factory(
                  new CoreLabelTokenFactory(),
                  "normalizeParentheses=false,normalizeOtherBrackets=false,invertible=true");

              /// <summary>
              /// Splits <paramref name="input"/> into sentences.
              /// </summary>
              /// <param name="input">The text to split.</param>
              /// <returns>One string per detected sentence, joined with its original whitespace.</returns>
              public static List<string> Go( string input )
              {
                  java.io.Reader source = new java.io.StringReader(input);
                  var preprocessor = new DocumentPreprocessor(source);
                  preprocessor.setTokenizerFactory(TokenizerFactory);

                  var sentences = new List<string>();
                  foreach (java.util.List tokens in preprocessor)
                  {
                      var sentenceText = StringUtils.joinWithOriginalWhiteSpace(tokens);
                      sentences.Add(sentenceText);
                  }

                  return sentences;
              }
          }
      }
      

      包装器很有帮助,因为解析器在 java 中。 nuget 使用 IKVMC 使其可被 .NET 调用。

      最后对于字数统计,我使用了一些代码来清理/标记化:

      module TextNormalizer
      
      open System;
      open System.Collections.Generic;
      open System.Linq;
      open System.Text.RegularExpressions;
      
      // Matches any run of whitespace.
      let spaceRegex = Regex(@"\s+")

      // Matches punctuation (anything that is neither a word character nor
      // whitespace), runs of digits, and underscores.
      let normalizeTextRegexStrict =
          let pattern = [ @"[^\w\s]"; @"[0-9]+"; "_" ] |> String.concat "|"
          Regex(pattern, RegexOptions.Compiled)

      // Same as the strict variant, except that apostrophes are left alone.
      let normalizeTextRegexApostrophe =
          let pattern = [ @"[^'\w\s]"; @"[0-9]+"; "_" ] |> String.concat "|"
          Regex(pattern, RegexOptions.Compiled)
      
      /// <summary>
      /// Replaces punctuation, digits and underscores with whitespace (optionally
      /// keeping apostrophes), collapses runs of whitespace to a single space,
      /// trims both ends and lower-cases the result.
      /// </summary>
      /// <param name="normedLine">The text to normalize.</param>
      /// <param name="removeApostrophe">
      /// When true, apostrophes are replaced like any other punctuation; when
      /// false, they are preserved.
      /// </param>
      let Normalize( normedLine ) ( removeApostrophe ) =
          // Pick the punctuation filter that matches the apostrophe policy.
          let punctuation =
              if removeApostrophe then normalizeTextRegexStrict
              else normalizeTextRegexApostrophe

          let withoutPunctuation = punctuation.Replace(normedLine, " ")
          let singleSpaced = spaceRegex.Replace(withoutPunctuation, " ")

          singleSpaced.Trim().ToLower()
      

      有了所有这些东西,计算 FK 就很简单了:

      /// <summary>
      /// Computes the Flesch-Kincaid grade level of a text:
      /// 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59.
      /// Sentences come from <c>StanfordWrapper.SentenceTokenizer</c>, words from
      /// <c>TextNormalizer.Normalize</c> (apostrophes kept), and syllables from
      /// <c>SyllableCount2</c>.
      /// </summary>
      /// <param name="text">The text to score.</param>
      /// <returns>
      /// The grade level, or 0.0 when the text contains no sentences or no words
      /// (the formula is undefined there and would otherwise yield NaN).
      /// </returns>
      let FleshKincaidGradeLevel( text ) =
          let sentences = StanfordWrapper.SentenceTokenizer.Go( text ) |> Seq.toArray

          // A sentence made only of punctuation or digits normalizes to "", and
          // "".Split(' ') would yield one empty token that is then counted as a
          // word (and a syllable). Drop empty tokens so only real words count.
          let words =
              sentences
              |> Array.map (fun sentence -> TextNormalizer.Normalize sentence false)
              |> Array.collect (fun line -> line.Split([| ' ' |], System.StringSplitOptions.RemoveEmptyEntries))

          if sentences.Length = 0 || words.Length = 0 then
              0.0
          else
              let syllableCount = words |> Array.sumBy SyllableCount2
              let wordCount = float words.Length

              //FKGL formula: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
              ( 0.39 * wordCount / (float sentences.Length) ) + ( 11.8 * (float syllableCount) / wordCount ) - 15.59
      

      【讨论】:

        【解决方案3】:

        我很惊讶没有用于此的库,但你真的需要它吗?

        如果你能得到原始文本,那么计算就相当简单了。

        查看 this (PHP) 的源代码可知,计数音节与计数句子类似,都使用正则表达式;区别在于不是按 .!? 拆分,而是按所有元音 aeiouy 拆分。

        【讨论】:

        【解决方案4】:

        在 Java 中有一个开源解决方案——它不是 .Net,但代码相对清晰,您应该可以自行翻译:https://github.com/ipeirotis/ReadabilityMetrics (Java)。它又基于 http://search.cpan.org/author/GREGFAST/Lingua-EN-Syllable-0.251/ (Perl)。

        【讨论】:

          猜你喜欢
          • 2018-08-22
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 2021-01-10
          • 2012-09-15
          • 1970-01-01
          • 1970-01-01
          相关资源
          最近更新 更多