Flesch-Kincaid 可读性测试答案

【问题标题】：Flesch-Kincaid readability test（Flesch-Kincaid 可读性测试）
【发布时间】：2012-02-27 09:30:03
【问题描述】：

是否有任何处理 Flesch-Kincaid 可读性计算的开源 .Net 库？

维基：http://en.wikipedia.org/wiki/Flesch-Kincaid_readability_test

【问题讨论】：

【解决方案1】：

不是开源的，但您可以通过 ReadabilityStatistic 接口把计算委托给 Word。即使您的文档最初不是 Word 文档，您也可以打开 Word（对用户不可见），将文本转储到 Word 中，然后使用 ReadabilityStatistic 计算统计信息。

【讨论】：

【解决方案2】：

如 Flesch-Kincaid 年级水平公式所述：

https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

您需要计算单词、句子和音节。音节可能是最棘手的，尽管句子也需要一些思考。

下面是我把其他人编写的两份音节计数代码翻译成 F# 的结果（F# 属于 .NET：您可以在 Visual Studio 中创建一个 F# 项目，然后从您的 C# 项目中引用它）。我对这些代码做过基本测试，但测试并不全面。

我发现，在我的一些测试用例中，Ipeirotis 的算法（在我加入问题单词列表之后）比 Child 的算法给出了更好的结果。我的测试词是：

// Sample words used to compare the two syllable counters.
let testWords =
    [|
        "abalone"
        "gracious"
        "atheism"
        "unaware"
        "seaside"
        "underwater"
        "wonderwoman"
        "biology"
    |]

Child 的代码在处理列表末尾的几个词时问题尤其明显。把正则表达式按词缀从最长到最短重新排序，似乎也无法解决这个问题。

我的翻译：

module Readability

open System.Text.RegularExpressions
//for syllables
//simpler:
//https://github.com/ipeirotis/ReadabilityMetrics/blob/master/src/main/java/com/ipeirotis/readability/engine/Syllabify.java

/// <summary>
/// Estimates the number of syllables in an English word by counting vowel groups
/// and adjusting the count with two lists of regular-expression patterns.
/// Words found in problemWordMap bypass the heuristic and return the stored count.
/// </summary>
/// <remarks>
/// problemWordMap is declared further down in this listing; F# requires it to be
/// defined before this function when the module is compiled.
/// </remarks>
/// <param name="word">A single word; it is lowercased before any lookup or matching.</param>
/// <returns>The estimated syllable count, never less than 1.</returns>
let SyllableCount2 (word:string) =
    // Each matching pattern lowers the estimate by one.
    let SubSyl = [| "cial"; "tia"; "cius"; "cious"; "giu"; "ion"; "iou"; "sia$"; ".ely$" |]
    // Each matching pattern raises the estimate by one.
    let AddSyl = [| "ia"; "riet"; "dien"; "iu"; "io"; "ii"; "[aeiouym]bl$"; "[aeiou]{3}"; "^mc"; "ism$"; "[^aeiouy][^aeiouy]l$"; "[^l]lien"; "^coa[dglx]."; "[^gq]ua[^auieo]"; "dnt$" |]

    let lowered = word.ToLower()

    // The exception list is keyed by lowercase words, so look up the lowercased
    // form; looking up the raw input would miss "Abalone", "ZOE", ...
    if problemWordMap.ContainsKey( lowered ) then
        problemWordMap.[lowered]
    else
        let spaced = lowered.Replace("'", " ")
        if spaced = "i" || spaced = "a" then
            1
        else
            // A trailing "e" is dropped before counting vowel groups.
            let stem =
                if spaced.EndsWith("e") then spaced.Substring(0, spaced.Length - 1)
                else spaced

            let countMatching (patterns:string[]) =
                patterns
                |> Array.filter (fun (pattern:string) -> Regex.IsMatch( stem, pattern ))
                |> Array.length

            // Every maximal run of vowels counts as one syllable.
            let vowelGroups =
                Regex.Split(stem, "[^aeiouy]+")
                |> Array.filter (fun group -> group.Length > 0)
                |> Array.length

            let singleLetterBonus = if stem.Length = 1 then 1 else 0

            let syl = vowelGroups + countMatching AddSyl - countMatching SubSyl + singleLetterBonus

            // A word always has at least one syllable; this also guards against a
            // negative total, which the previous "= 0" check let through.
            max 1 syl

//https://github.com/DaveChild/Text-Statistics/blob/master/src/DaveChild/TextStatistics/Syllables.php

// Words whose syllable count the heuristics get wrong, mapped to the correct count.
// Keys are lowercase. Entries marked "ap" are plural forms added by the translator
// ("andrew plural") next to the singular taken from the PHP original.
let problemWordMap =
    dict[
        ("abalone", 4);
        ("abare", 3);
        ("abed" , 2);
        ("abruzzese", 4);
        ("abbruzzese" , 4);
        ("aborigine", 5);
        ("aborigines", 5); //ap
        ("acreage", 3);
        // Was a second "acreage" entry; the plural adds a syllable (a-cre-ag-es).
        ("acreages", 4); //ap
        ("adame", 3);
        ("adieu", 2);
        ("adobe", 3);
        ("anemone", 4);
        ("anemones", 4); //ap
        ("apache" , 3);
        ("apaches" , 3); //ap
        ("aphrodite", 4);
        ("apostrophe" , 4);
        ("apostrophes" , 4); //ap
        ("ariadne", 4);
        ("cafe" , 2);
        ("cafes" , 2); //ap
        ("calliope" , 4);
        ("catastrophe", 4);
        ("catastrophes", 4); //ap
        ("chile", 2);
        ("chiles", 2); //ap
        ("chloe", 2);
        ("circe", 2);
        ("coyote" , 3);
        ("coyotes" , 3); //ap
        ("epitome", 4);
        ("forever", 3);
        ("gethsemane" , 4);
        ("guacamole", 4);
        ("guacamoles", 4); //ap
        ("hyperbole", 4);
        ("hyperboles", 4); //ap
        ("jesse", 2);
        ("jukebox", 2);
        // The "-es" plural after "x" is pronounced as its own syllable (juke-box-es).
        ("jukeboxes", 3); //ap
        ("karate" , 3);
        ("karates" , 3); //ap
        ("machete", 3);
        ("maybe", 2);
        ("people" , 2);
        ("recipe" , 3);
        ("sesame" , 3);
        ("shoreline", 2);
        ("simile" , 3);
        ("machetes", 3); //ap
        ("maybes", 2);//ap
        ("peoples" , 2);//ap
        ("recipes" , 3);//ap
        ("sesames" , 3);//ap
        ("shorelines", 2);//ap
        ("similes" , 3);//ap
        ("syncope", 3);
        ("tamale" , 3);
        ("tamales" , 3); //ap
        ("yosemite" , 4);
        ("daphne" , 2);
        ("eurydice" , 4);
        ("euterpe", 3);
        ("hermione" , 4);
        ("penelope" , 4);
        ("persephone" , 4);
        ("phoebe" , 2);
        ("zoe", 2);
    ]

// Letter groups that the vowel-group count treats as two syllables although they
// are pronounced as one. The patterns are joined with "|" into a single Regex.
let oneSyllableCorrection =
    let patterns =
        [
            "cia(l|$)" // glacial, acacia
            "tia"
            "cius"
            "cious"
            "[^aeiou]giu"
            "[aeiouy][^aeiouy]ion"
            "iou"
            "sia$"
            "eous$"
            "[oa]gue$"
            ".[^aeiuoycgltdb]{2,}ed$"
            ".ely$"
            // Disabled: "[cg]h?ed?$"
            // Disabled: "rved?$"
            // Disabled: "[aeiouy][dt]es?$"
            // Disabled: "^[dr]e[aeiou][^aeiou]+$" (sorts out deal, deign etc)
            // Disabled: "[aeiouy]rse$" (purse, hearse)
            "^jua"
            // Disabled: "nne[ds]?$" (canadienne)
            "uai" // acquainted
            "eau" // champeau
            // Disabled: "pagne[ds]?$" (champagne)
            // Disabled: "[aeiouy][^aeiuoytdbcgrnzs]h?e[rsd]?$"

            // The next pattern detects words with a soft "e" ending. Don't mess
            // with it unless you absolutely have to! Words for testing a new
            // version of the rule (add 'r', 's' and 'd' where possible to test
            // fully): absolve, acquiesce, audience, ache, acquire, brunelle,
            // byrne, canadienne, coughed, curved, champagne, designate, force,
            // lace, late, lathe, make, relayed, scrounge, side, sideline, some,
            // wide, taste
            "[aeiouy](b|c|ch|d|dg|f|g|gh|gn|k|l|ll|lv|m|mm|n|nc|ng|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y|z)e$"

            // Soft "e" endings followed by "d". Test words: crunched, forced,
            // hated, sided, sidelined, unexploded, unexplored, scrounged,
            // squelched
            "[aeiouy](b|c|ch|dg|f|g|gh|gn|k|l|lch|ll|lv|m|mm|n|nc|ng|nch|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|th|v|y|z)ed$"

            // Soft "e" endings followed by "s". Test words: absences,
            // accomplices, acknowledges, advantages, byrnes, crunches, forces,
            // scrounges, squelches
            "[aeiouy](b|ch|d|f|gh|gn|k|l|lch|ll|lv|m|mm|n|nch|nn|p|r|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y)es$"
            "^busi$"
        ]
    Regex(String.concat "|" patterns)


// Letter groups that the vowel-group count treats as one syllable although they
// are pronounced as two. The patterns are joined with "|" into a single Regex.
let twoSyllableCorrection =
    let patterns =
        [
            "([^s]|^)ia"
            "riet"
            "dien" // audience
            "iu"
            "io"
            "eo($|[b-df-hj-np-tv-z])"
            "ii"
            "[ou]a$"
            "[aeiouym]bl$"
            "[aeiou]{3}"
            "[aeiou]y[aeiou]"
            "^mc"
            "ism$"
            "asm$"
            "thm$"
            "([^aeiouy])\1l$"
            "[^l]lien"
            "^coa[dglx]."
            "[^gq]ua[^auieo]"
            "dnt$"
            "uity$"
            "[^aeiouy]ie(r|st|t)$"
            "eings?$"
            "[aeiouy]sh?e[rsd]$"
            "iell"
            "dea$"
            "real" // real, cereal
            "[^aeiou]y[ae]" // bryan, byerley
            "gean$" // aegean
            "uen" // influence, affluence
        ]
    Regex(String.concat "|" patterns)

// Prefixes ("^...") and suffixes ("...$") that are one syllable long, joined
// with "|" into a single Regex.
let oneSyllableAffix =
    let prefixes =
        [ "^un"; "^fore"; "^ware"; "^none?"; "^out"; "^post"; "^sub"; "^pre"; "^pro"; "^dis"; "^side" ]
    let suffixes =
        [ "ly$"; "less$"; "some$"; "ful$"; "ers?$"; "ness$"; "cians?$"; "ments?$"
          "ettes?$"; "villes?$"; "ships?$"; "sides?$"; "ports?$"; "shires?$"; "tion(ed)?$" ]
    Regex(String.concat "|" (prefixes @ suffixes))

// Prefixes ("^...") and suffixes ("...$") that are two syllables long, joined
// with "|" into a single Regex.
let twoSyllableAffix =
    let prefixes =
        [ "^above"; "^ant[ie]"; "^counter"; "^hyper"; "^afore"; "^agri"; "^in[ft]ra"
          "^inter"; "^over"; "^semi"; "^ultra"; "^under"; "^extra"; "^dia"
          "^micro"; "^mega"; "^kilo"; "^pico"; "^nano"; "^macro" ]
    let suffixes =
        [ "berry$"; "woman$"; "women$" ]
    Regex(String.concat "|" (prefixes @ suffixes))

// Suffixes that are three syllables long, joined with "|" into a single Regex.
let threeSyllableAffix =
    let suffixes = [ "ology$"; "ologist$"; "onomy$"; "onomist$" ]
    Regex(String.concat "|" suffixes)

/// <summary>
/// Removes every match of the given regex from the input word and reports how
/// many matches were found.
/// </summary>
/// <param name="regex">Pattern whose matches are removed.</param>
/// <param name="word">Word to strip.</param>
/// <returns>A tuple of (word with all matches removed, number of matches).</returns>
let RegexReplace (regex:Regex) (word:string) =
    let matchCount = regex.Matches(word).Count
    // Regex.Replace deletes each match at the position where it matched. The
    // previous String.Replace(match.Value, "") removed that text everywhere in
    // the word, so "^un" applied to "unsung" produced "sg" instead of "sung".
    regex.Replace(word, ""), matchCount

/// <summary>
/// Returns how many times the regex matches within the word.
/// </summary>
/// <param name="regex">Pattern to look for.</param>
/// <param name="word">Text to search.</param>
let CountMatches (regex:Regex) word =
    let hits : MatchCollection = regex.Matches(word)
    hits.Count

/// <summary>
/// Counts syllables in word. Assumes word has already been "cleaned"
/// (the exception list and all patterns are lowercase).
/// </summary>
/// <remarks>
/// Known affixes are stripped and credited with their own syllable count, the
/// vowel groups of the remaining stem are counted, and the correction patterns
/// then adjust that stem count.
/// </remarks>
/// <param name="word">The word to measure.</param>
/// <returns>The estimated syllable count, never less than 1.</returns>
let SyllableCount( word : string) =
    if problemWordMap.ContainsKey( word ) then
        problemWordMap.[word]
    else
        // Strip affixes in order: one-, two-, then three-syllable ones.
        let withoutSingle, singleCount = RegexReplace oneSyllableAffix word
        let withoutDouble, doubleCount = RegexReplace twoSyllableAffix withoutSingle
        let stem, tripleCount = RegexReplace threeSyllableAffix withoutDouble

        // Each removed affix contributes as many syllables as its list says;
        // counting every affix as one undercounted "underwater", "biology", ...
        let affixSyllables = singleCount + 2 * doubleCount + 3 * tripleCount

        // Every run of vowels left in the stem is one syllable.
        let vowelGroupCount =
            Regex.Split(stem, "[^aeiouy]")
            |> Array.filter (fun part -> part.Length > 0)
            |> Array.length

        // Corrections are evaluated on the stem only: the affixes have already
        // been credited, so matching them again would adjust the total twice.
        let countedTwoButOne = CountMatches oneSyllableCorrection stem
        let countedOneButTwo = CountMatches twoSyllableCorrection stem

        // We always have at least 1 syllable in a word.
        max 1 (affixSyllables + vowelGroupCount - countedTwoButOne + countedOneButTwo)

为了处理句子计数，我使用了斯坦福解析器的 nuget 包并创建了这个包装器：

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using edu.stanford.nlp.process;
using edu.stanford.nlp.util;

namespace StanfordWrapper
{
    /// <summary>
    /// Thin wrapper that splits text into sentences using the Stanford
    /// <c>DocumentPreprocessor</c> with a PTB tokenizer.
    /// </summary>
    public class SentenceTokenizer
    {
        /// <summary>
        /// Tokenizer factory shared by all calls, created from
        /// <c>PTBTokenizer.factory</c> with the option string shown below.
        /// </summary>
        public static readonly TokenizerFactory TokenizerFactory = PTBTokenizer.factory(
            new CoreLabelTokenFactory(),
            "normalizeParentheses=false,normalizeOtherBrackets=false,invertible=true");

        /// <summary>
        /// Splits <paramref name="input"/> into sentences.
        /// </summary>
        /// <param name="input">Text to split.</param>
        /// <returns>
        /// One string per sentence, each built with
        /// <c>StringUtils.joinWithOriginalWhiteSpace</c>.
        /// </returns>
        public static List<string> Go( string input )
        {
            java.io.Reader source = new java.io.StringReader(input);
            var preprocessor = new DocumentPreprocessor(source);
            preprocessor.setTokenizerFactory(TokenizerFactory);

            var sentences = new List<string>();
            foreach (java.util.List tokens in preprocessor)
            {
                string sentenceText = StringUtils.joinWithOriginalWhiteSpace(tokens);
                sentences.Add(sentenceText);
            }

            return sentences;
        }
    }
}

这个包装器很有用，因为解析器是用 Java 编写的；该 NuGet 包借助 IKVMC 使其可以从 .NET 中调用。

最后对于字数统计，我使用了一些代码来清理/标记化：

module TextNormalizer

open System;
open System.Collections.Generic;
open System.Linq;
open System.Text.RegularExpressions;

// Matches any run of whitespace.
let spaceRegex = Regex @"\s+"

// Matches anything that is not a word character or whitespace, plus digit runs and underscores.
let normalizeTextRegexStrict =
    let alternatives = [| @"[^\w\s]"; @"[0-9]+"; "_" |]
    Regex( String.Join("|", alternatives), RegexOptions.Compiled )

// Same as the strict pattern, except that apostrophes are not matched.
let normalizeTextRegexApostrophe =
    let alternatives = [| @"[^'\w\s]"; @"[0-9]+"; "_" |]
    Regex( String.Join("|", alternatives), RegexOptions.Compiled )

/// <summary>
/// Replaces punctuation, digits and underscores with whitespace (keeping
/// apostrophes when requested), collapses each run of whitespace to a single
/// space, trims both ends and lowercases the result.
/// </summary>
/// <param name="normedLine">The text to normalize.</param>
/// <param name="removeApostrophe">
/// When true, apostrophes are replaced like any other punctuation; when false
/// they are left in place.
/// </param>
let Normalize( normedLine ) ( removeApostrophe ) =
    // Pick the pattern first, then run the same three steps for both modes.
    let punctuationRegex =
        if removeApostrophe then normalizeTextRegexStrict else normalizeTextRegexApostrophe

    let withoutPunctuation = punctuationRegex.Replace(normedLine, " ")
    let singleSpaced = spaceRegex.Replace( withoutPunctuation, " " )
    let trimmed = singleSpaced.Trim()
    trimmed.ToLower()

有了所有这些东西，计算 FK 就很简单了：

/// <summary>
/// Computes the Flesch-Kincaid grade level of a text:
/// 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59.
/// </summary>
/// <param name="text">The raw text to score.</param>
/// <returns>
/// The grade level, or NaN when the text contains no sentences or no words.
/// </returns>
let FleshKincaidGradeLevel( text ) =
    let sentences = StanfordWrapper.SentenceTokenizer.Go( text ) |> Seq.toArray

    // Normalize each sentence (keeping apostrophes) and split it into words.
    // Empty tokens are dropped: a sentence that normalizes to "" used to yield
    // one empty "word", which was then counted as a word with one syllable.
    let words =
        sentences
        |> Array.map( fun sentence -> TextNormalizer.Normalize sentence false )
        |> Array.collect( fun (normalized:string) -> normalized.Split( [| ' ' |], System.StringSplitOptions.RemoveEmptyEntries ) )

    if sentences.Length = 0 || words.Length = 0 then
        // The formula divides by both counts, so it is undefined here.
        nan
    else
        let syllableCount = words |> Array.map SyllableCount2 |> Array.sum

        //FKGL formula: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
        ( 0.39 * ( float words.Length) / (float sentences.Length ) ) + ( 11.8 * (float syllableCount ) / ( float words.Length) ) - 15.59

【讨论】：

【解决方案3】：

我很惊讶没有用于此的库，但你真的需要它吗？

如果你能得到原始文本，那么计算就相当简单了。

查看这个 PHP 实现的源代码可以看到，计数音节的做法和计数句子类似，都是使用正则表达式，只不过不是按 .!? 拆分，而是按所有元音字母 aeiouy 拆分。

【讨论】：

这是一个非常粗略的假设，请参阅 stackoverflow.com/a/1076924/1226839 了解更多原因（或者看看这句话里含有多个元音的单词 :-)）

【解决方案4】：

有一个用 Java 编写的开源解决方案——它不是 .NET 的，但代码相对清晰，您或许可以自行翻译：https://github.com/ipeirotis/ReadabilityMetrics（Java）。它又基于 http://search.cpan.org/author/GREGFAST/Lingua-EN-Syllable-0.251/（Perl）。

【讨论】：