【问题标题】:Get Longest Common SubString from a list of sentences从句子列表中获取最长的公共子字符串
【发布时间】:2019-09-06 06:29:17
【问题描述】:

鉴于以下几行

                "Alberry K2503 F40 D",
                "Alberry K2503 F40 S",
                "Demi Deco Denver BLK",
                "Demi Deco Denver BRN",
                "Demi Deco Tank",
                "Demi Deco Audi",
                "Samsung S 19 S10",
                "Samsung S 19 S12"

我需要得到一个包含以下内容的列表

 Alberry K2503 F40
 Demi Deco Denver
 Demi Deco
 Samsung S 19

我试图以这种方式实现它。

    class TrieNode
    {
        public string Word { get; set; }

        public TrieNode Parent { get; set; }

        public Dictionary<string, TrieNode> Children { get; set; } = new Dictionary<string, TrieNode>();

        public override string ToString()
        {
            return $"{Word}";
        }
    }

    class Program
    {
        static string JoinSentence(TrieNode node)
        {
            List<string> sentence = new List<string>();
            while (node != null)
            {
                sentence.Insert(0, node.Word);
                node = node.Parent;
            }

            return string.Join(" ", sentence);
        }

        static void GetSentences(TrieNode node, HashSet<string> sentences)
        {
            if (node.Children.Count > 0)
            {
                foreach (var nodeChild in node.Children)
                {
                    GetSentences(nodeChild.Value, sentences);
                }
            }
            else
            {
                if (node.Parent.Children.Count == 1)
                {
                    sentences.Add(JoinSentence(node));
                }
                else
                {
                    bool lastChildren = false;
                    foreach (var child in node.Parent.Children)
                    {
                        if (child.Value != node)
                        {
                            if (child.Value.Children.Count > 0)
                            {
                                lastChildren = true;
                                sentences.Add(JoinSentence(node));
                                break;
                            }
                        }
                    }

                    if (!lastChildren)
                    {
                        sentences.Add(JoinSentence(node.Parent));
                    }
                }
            }
        }

        static void Main(string[] args)
        {
            var root = new TrieNode();
            var sentences = new[]
            {
                "Alberry K2503 F40 D",
                "Alberry K2503 F40 S",
                "Demi Deco Denver BLK",
                "Demi Deco Denver BRN",
                "Demi Deco Tank",
                "Demi Deco Audi",
                "Samsung S 19 S10",
                "Samsung S 19 S12"
            };

            foreach (var sentence in sentences)
            {
                var words = sentence.Split(' ');
                TrieNode node = null;
                foreach (var word in words)
                {
                    if (node == null)
                    {
                        if (root.Children.ContainsKey(word))
                        {
                            node = root.Children[word];
                        }
                        else
                        {
                            node = new TrieNode {Word = word, Parent = root};
                            root.Children.Add(word, node);
                        }
                    }
                    else
                    {
                        if (node.Children.ContainsKey(word))
                        {
                            node = node.Children[word];
                        }
                        else
                        {
                            node.Children.Add(word, node = new TrieNode {Word = word, Parent = node});
                        }
                    }
                }
            }

            var sentencesCommon = new HashSet<string>();

            GetSentences(root, sentencesCommon);
            foreach (var sentence in sentencesCommon)
            {
                Debug.WriteLine(sentence);
            }
        }
    }

它似乎可以工作,但缺少Demi Deco 的结果,其中AudiTank 应被省略。 我想我真的搞砸了正确遍历树并获得独特的句子。看来我正在重新发明轮子。有人会推荐更好的解决方案吗?

谢谢

【问题讨论】:

  • 如果列表中包含"Alberry K2503 F40 D""Alberry F40 K2503 S",这意味着在交换了一些单词之后。
  • 你试过在the Wikipedia page about this problem上实现算法吗?
  • 这两个新句子只有一个共同的起始词“Alverry”,这就是结果。

标签: c# algorithm lcs


【解决方案1】:

您要求最长的公共子字符串,您可以通过将每个字符串相互匹配来做到这一点。但是,您的预期输出表明您只想匹配整个单词,即最多匹配一个空格字符。这适用于您的示例测试数据:

public void Run()
{
    List<string> output = new List<string>();
    for (int i = 0; i < input.Count-1; ++i)
    {
        for (int j = i+1; j< input.Count; ++j)
        {
            string leftMatch = LeftMatch(input[i], input[j]);
            if (leftMatch.Length>0 && !output.Contains(leftMatch))
            {
                output.Add(leftMatch);
            }
        }
    }
    output.ForEach(x=>Console.WriteLine(x));
}
public string LeftMatch(string a, string b)
{
    string result = "";
    for ( int i=0; i<a.Length&& i<b.Length; ++i)
    {
        if (a[i] != b[i])
        {
            if (!result.Contains(" ")) return "";
            return result.Substring(0, result.LastIndexOf(" ", StringComparison.Ordinal));
        }
        result += a[i];
    }
    return result;
}
List<string> input =
new List<string>{
    "Alberry K2503 F40 D",
    "Alberry K2503 F40 S",
    "Demi Deco Denver BLK",
    "Demi Deco Denver BRN",
    "Demi Deco Tank",
    "Demi Deco Audi",
    "Samsung S 19 S10",
    "Samsung S 19 S12"
};

结果如下:

Alberry K2503 F40 
Demi Deco Denver
Demi Deco  
Samsung S 19

【讨论】:

  • 是的,我需要处理单词,而不是字符应该更快,因为我有数千条记录要处理。
  • 您可能会感到惊讶,索引到字符串比拆分字符然后比较字符串更快(无论如何都会遍历封面下的字符)。
  • 我已更新我的答案以提供预期的输出。
  • 我认为 LeftMath 有错误,if (i == 0) 立即返回。
  • 糟糕,是的,在剪切粘贴中遗漏了一点
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2014-04-14
相关资源
最近更新 更多