【发布时间】:2012-11-14 18:55:28
【问题描述】:
好的,这是交易的人。我有two text files。每个都包含500 行(句子)。
我已经将它们加载到memory 到它们自己的数组中(数据类型:字符串)。我们将其命名为array A & B。
接下来我得到array A中的first sentence,用SPACE作为分隔符将它拆分成另一个array C,从而得到单词。
然后对于array B中的每个句子,我将其拆分为array D,使用SPACE作为分隔符再次获取单词,并将array C中的每个单词与array D中的每个单词进行比较计算百分比匹配两个句子之间。
我计算array A 中第一句话与array B 中所有句子的平均匹配百分比。
然后我将它存储到一个Array E 中,其中包含array A 的所有句子及其平均匹配百分比。
对于数组 A 中的每个标题,我都按照上述 first sentence 所做的操作。
问题是处理数组 A 中的每个标题大约需要 15 秒。无论如何我可以优化这个时间以更快吗?
硬件: AMD Phenom I 32 位四核
代码:
Imports System.IO
Imports System.Object
Imports System.Xml
Imports System.Text.RegularExpressions
Module Module1
Sub Main()
'Important File Paths
Dim titlesFilePath As String = Environment.CurrentDirectory & "\titles.txt"
Dim xmlTitlesFilePath As String = Environment.CurrentDirectory & "\extractedTitles.txt"
Dim stopWordsFilePath As String = Environment.CurrentDirectory & "\stopWords.txt"
'Import Important Data From Files -> Memory
Dim titles As Array = FileToArray(titlesFilePath)
Dim stopWords As Array = FileToArray(stopWordsFilePath)
Dim xmlDataUnprocessed As Array = FileToArray(xmlTitlesFilePath)
'Delimters To Filter Titles For
Dim userDefinedDelimeters(4, 1)
userDefinedDelimeters(0, 0) = "-"
userDefinedDelimeters(0, 1) = " "
userDefinedDelimeters(1, 0) = ","
userDefinedDelimeters(1, 1) = " "
userDefinedDelimeters(2, 0) = "—"
userDefinedDelimeters(2, 1) = " "
userDefinedDelimeters(3, 0) = "'s"
userDefinedDelimeters(3, 1) = ""
userDefinedDelimeters(4, 0) = "'"
userDefinedDelimeters(4, 1) = " "
'Declare Important Variables
Dim xmlData(xmlDataUnprocessed.Length / 2, 1)
Dim xmlTurn = 0
Dim xmlDataCount = 0
'Create Feed Title/URL Array
For i = 0 To (xmlDataUnprocessed.Length - 1)
If xmlTurn = 0 Then
xmlData(xmlDataCount, 0) = xmlDataUnprocessed(i)
xmlTurn = 1
Else
xmlData(xmlDataCount, 1) = xmlDataUnprocessed(i)
xmlTurn = 0
xmlDataCount += 1
End If
Next
'CPU-Intensive Stuff Occurs
Dim xmlTitle As String
Dim xmlTitleWords As Array
Dim savedTitleWords As Array
Dim titleResults(xmlData.GetUpperBound(0) - 1, 1)
Dim titlePercentageMatch As Integer
Dim numberOfTitlesMatched As Integer
For i = 0 To xmlData.GetUpperBound(0) - 1
Console.WriteLine("Working On Title No. " & i & " Out Of " & xmlData.GetUpperBound(0) - 1)
titlePercentageMatch = 0
numberOfTitlesMatched = 0
xmlTitle = xmlData(i, 0)
xmlTitle = processTitle(stopWords, userDefinedDelimeters, xmlTitle)
xmlTitleWords = xmlTitle.Split(" ")
For Each title In titles
title = processTitle(stopWords, userDefinedDelimeters, title)
savedTitleWords = title.split(" ")
Dim compareResult = compareTitle(xmlTitleWords, savedTitleWords)
If compareResult > 0 Then
titlePercentageMatch += compareResult
numberOfTitlesMatched += 1
End If
Next
titleResults(i, 0) = xmlData(i, 0)
titleResults(i, 1) = (titlePercentageMatch / numberOfTitlesMatched)
Next
For i = 0 To titleResults.GetUpperBound(0) - 1
Console.WriteLine(titleResults(i, 0) & " ---> " & titleResults(i, 1) & vbCrLf)
Next
Console.Read()
End Sub
Function compareTitle(ByRef xmlTitleWords As Array, ByRef savedTitleWords As Array)
Dim NumberOfMatches = 0
For Each xmlWord In xmlTitleWords
For Each savedWord In savedTitleWords
If (xmlWord.ToString.ToLower = savedWord.ToString.ToLower) Then
NumberOfMatches += 1
End If
Next
Next
Return ((NumberOfMatches / xmlTitleWords.Length) * 100)
End Function
Function processTitle(ByRef stopWordArray As Array, ByRef delimArray As Array, ByVal title As String)
title = removeStopWords(stopWordArray, title)
title = removeDelims(delimArray, title)
Return title
End Function
Function removeStopWords(ByRef stopWordsArray As Array, ByVal sentence As String)
For i = 0 To stopWordsArray.Length - 1
If sentence.ToLower.Contains(" " & stopWordsArray(i).ToString.ToLower & " ") = True Then
sentence = Microsoft.VisualBasic.Strings.Replace(sentence, " " & stopWordsArray(i) & " ", " ", 1, -1, Constants.vbTextCompare)
'ElseIf sentence.ToLower.Contains(stopWordsArray(i).ToString.ToLower & " ") = True Then
'sentence = Microsoft.VisualBasic.Strings.Replace(sentence, stopWordsArray(i) & " ", "", 1, -1, Constants.vbTextCompare)
End If
sentence = Regex.Replace(sentence, "\s+", " ")
Dim Words = sentence.ToLower.Split(" ")
If Words(0).ToString.ToLower & " " = stopWordsArray(i).ToString.ToLower & " " Then
sentence = sentence.Remove(0, stopWordsArray(i).ToString.ToLower.Length + 1)
End If
Words = sentence.ToLower.Split(" ")
Dim LastWord = Words(Words.Length - 1)
'Console.WriteLine(LastWord & "++")
If " " & LastWord.ToString.ToLower = " " & stopWordsArray(i).ToString.ToLower Then
sentence = sentence.Remove(sentence.Length - 1 - LastWord.Length, stopWordsArray(i).ToString.ToLower.Length + 1)
End If
Next
sentence = Regex.Replace(sentence, "\s+", " ")
Return sentence
End Function
Function removeDelims(ByRef delimArray As Array, ByVal sentence As String)
For i = 0 To delimArray.GetUpperBound(0) - 1
sentence = sentence.Replace(delimArray(i, 0), delimArray(i, 1))
Next
sentence = Regex.Replace(sentence, "\s+", " ")
Return sentence
End Function
Function FileToArray(ByVal filePath As String) As String()
Dim content As String
Dim lines As New ArrayList
Dim sr As System.IO.StreamReader
' read the file's lines into an ArrayList
Try
sr = New System.IO.StreamReader(filePath)
Do While sr.Peek() >= 0
lines.Add(sr.ReadLine())
Loop
Finally
If Not sr Is Nothing Then sr.Close()
End Try
' convert from ArrayList to a String array
Return CType(lines.ToArray(GetType(String)), String())
End Function
End Module
编辑:我希望它不会太混乱。对于那个很抱歉! 编辑 2: 提供酱汁:P
【问题讨论】:
-
你有一个错误:如果
savedTitleWords.Length > xmlTitleWords.Length和xmlTitleWords包含所有savedTitleWords,NumberOfMatches / xmlTitleWords.Length可能会导致错误的 100% 匹配。 -
编辑:感谢您的澄清!嗯,算法肯定需要调整 :) 基本上,整个程序是为了一个简单的实验,我正在尝试找到与我需要的以前文章相似的文章标题。关于如何实施这样的计划有什么建议吗?我绞尽脑汁,但这只是原型设计阶段,需要更多地研究匹配算法,任何帮助都会非常感激:)
-
如果外部循环中的当前标题是 Hello World 并且内部循环中的当前标题是 Hello World How Are You 那么你会得到两场比赛。当您将 2 除以外部循环的长度时,您会得到 1 (100%)……但标题与 100% 不匹配。
-
这听起来确实是一个有趣的项目。也许您需要定义“相似”是什么。例如“相似的意思是 >= 90% 的共同词,不包括标点符号和不重要的词,如 the”。您还可能希望根据标题长度在滑动比例上制作百分比(例如,具有 20 个单词的标题可能“足够相似”,只有 50% 相同)。我很好奇百分比是有意义的。你可以进行语义分析,但这是一个全新的复杂程度......
-
你可以看看n-grams。您可能需要考虑词序。您也可以同时尝试几种算法,比较结果并选择您认为最好的算法。
标签: .net vb.net algorithm optimization