Wednesday, March 3, 2010

Peter Norvig's Spelling Corrector in VB

Some Background: The idea behind this implementation was not to build the shortest or the fastest version of the spelling corrector. I looked at the list of languages that this was implemented in and found VB .NET missing. So I decided to fill that void. I also thought of it as a way to unite the cult of VB .NET programmers with the others.
(!!!Noble Peace Prize nomination here please!!!)

More importantly, I wanted to spell out each of the steps to make it easier to understand the concept. I also wanted to use native libraries so that you can dive into the spelling corrector concepts quickly without first having to learn other technologies like LINQ etc.

Please feel free to post any comments, suggestions for improvement and any bugs you find.

Copy the text below into a VB class file and get the BIG.txt file (http://norvig.com/big.txt).

' VB .NET Implementation of Peter Norvig's Spelling Corrector.
' VERSION 1.0, last updated 03 Mar 10.
'
' Peter Norvig's original article located at
' http://norvig.com/spell-correct.html
'
' Copyright (c) 2010 Shantanu Inamdar
'
' Permission is hereby granted, free of charge, to any person obtaining a copy
' of this software and associated documentation files (the "Software"), to deal
' in the Software without restriction, including without limitation the rights
' to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
' copies of the Software, and to permit persons to whom the Software is
' furnished to do so, subject to the following conditions:

' The above copyright notice and this permission notice shall be included in
' all copies or substantial portions of the Software.

' THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
' EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
' MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
' IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
' ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
' CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
' WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


' Some Background: The idea behind this implementation was not build the shortest
' or the fastest version of the spelling corrector.
' I looked at the list of languages that this was implemented in and found VB .NET
' missing. So I decided to fill that void. I also thought of it as a way to unite
' the VB .NET programmers with the others.
' (!!!Noble Peace Prize nomination here please!!!)
'
' More importantly, I wanted to spell out each of the steps to make it easier to
' understand the concept. I also wanted to use native libraries
' so that you can dive into the spelling corrector concepts quickly
' without first having to learn other technologies like LINQ etc.
'
' Please feel free to post any comments, suggestions for improvement and any bugs
' you find.

Imports System.Collections.Generic

Public Class SimpleSpellingCorrector

Private Const COMMAASCII As Integer = 44
Private Const ALPHABET As String = "abcdefghijklmnopqrstuvwxyz"
Private nWords As New Dictionary(Of String, Integer)


Public Function getCorrection(ByVal word As String) As String
Dim c As String = String.Empty
Dim maxc As Integer = -1
Dim wc As Integer = 0
Dim candidates As System.Collections.Generic.List(Of String)

'Train the model with word occurences in our "dictionary"
nWords = getModel()

'Choose the most probable word with the shortest edit distance
For ed As Integer = 0 To 2
'If we have found a correction, exit loop
If String.Empty <> c Then
Exit For
End If

'Otherwise, start over
c = String.Empty
wc = 0
maxc = -1
candidates = getCandidates(word, ed)
For Each cd As String In candidates
wc = getWordCount(cd)
If wc > maxc Then
maxc = wc
c = cd
End If
Next cd
Next ed

'If no match is found, just send the same word back!
If String.Empty = c Then
c = word
End If

Return c
End Function

'Get the count of how often the word is found in our "dictionary"
'Return 1 for a "new" word
Private Function getWordCount(ByVal word As String) As Integer
If nWords.ContainsKey(word) Then
Return nWords.Item(word)
Else
Return 1
End If
End Function


'Get the big.txt file from http://norvig.com/big.txt
Private Function getModel() As Dictionary(Of String, Integer)
Dim model As New Dictionary(Of String, Integer)
For Each f As System.Text.RegularExpressions.Match In System.Text.RegularExpressions.Regex.Matches(System.IO.File.ReadAllText("big.txt").ToLower(), "[a-z]+", System.Text.RegularExpressions.RegexOptions.Compiled)
If model.ContainsKey(f.Value) Then
model.Item(f.Value) += 1
Else
model.Add(f.Value, 1)
End If
Next f

Return model
End Function

'Get Candidate words that are at the given edit distance
Private Function getCandidates(ByVal word As String, ByVal edits As Integer) As List(Of String)
Dim c As New List(Of String)
Select Case edits
Case 0
Dim wl As New List(Of String)
wl.Add(word)
c.AddRange(getKnown(wl))
Case 1
c.AddRange(getKnown(getEdits1(word)))
Case 2
c.AddRange(getKnownEdits2(word))
Case Else
c.Add(word)
End Select
Return c
End Function

'Get words that are at an edit distance of 1
Private Function getEdits1(ByVal word As String) As List(Of String)

Dim e1 As New List(Of String)
Dim splits As New List(Of String)

'Create a list of comma separated tuples of all possible ways to split the word
For i As Integer = 0 To word.Length
splits.Add(word.Substring(0, i) & "," & word.Substring(i))
Next i

For Each s As String In splits
'Deletes
If String.Empty <> s.Split(Chr(COMMAASCII))(1) Then
e1.Add(s.Split(Chr(COMMAASCII))(0) & s.Split(Chr(COMMAASCII))(1).Substring(1))
End If

'Transposes
If 1 <> s.Split(Chr(COMMAASCII))(1) Then
e1.Add(s.Split(Chr(COMMAASCII))(0) & c & s.Split(Chr(COMMAASCII))(1).Substring(1))
End If


'Replaces
For Each c As Char In ALPHABET.ToCharArray
If String.Empty <> s.Split(Chr(COMMAASCII))(1) Then
e1.Add(s.Split(Chr(COMMAASCII))(0) & c & s.Split(Chr(COMMAASCII)(1).Substring(1))

End If
Next

'Inserts
For Each c As Char In ALPHABET.ToCharArray
e1.Add(s.Split(Chr(COMMAASCII))(0) & c & s.Split(Chr(COMMAASCII))(1))
Next
Next s

Return e1
End Function

'Get Known words that have edit distance of 2
Private Function getKnownEdits2(ByVal word As String) As List(Of String)
Dim ke2 As New List(Of String)
For Each e1 As String In getEdits1(word)
For Each e2 As String In getEdits1(e1)
If nWords.ContainsKey(e2) Then
ke2.Add(e2)
End If
Next e2
Next e1
Return ke2
End Function

'Get Known words; get rid of unknown words
Private Function getKnown(ByRef words As List(Of String)) As List(Of String)
Dim k As New List(Of String)
For Each w As String In words
If nWords.ContainsKey(w) Then
k.Add(w)
End If
Next w
Return k
End Function

End Class