Cleaning up our previous work
Developing software has a lot of fun parts: solving problems, designing a system, experimenting with what you have finished so far…it has a lot of interesting and delightful things to be done. Unfortunately (or fortunately, depending on how you look at it), developing software also has a couple less delightful parts: debugging, testing, and least of all, cleaning your implementation.
Today we’re going to cover cleaning up our previous (mess) of code, I’m going to post the file I’ve been working with below, but if you’ve been following along you probably have one just as big of a mess:
#I "bin\\Release\\"
#r "Extensions.dll"
open System.Net
type UnicodeConfusables = { Original : int; Replacement : int array; Field3 : string }
let mapToConfusable =
function
| [|a; b; c|] ->
{ Original = a |> String.trim |> Int.fromHex
Replacement = b |> String.trim |> String.split ' ' |> Array.map Int.fromHex
Field3 = c |> String.trim } |> Some
| _ -> None
let file = "http://www.unicode.org/Public/security/10.0.0/confusables.txt"
let data =
use wc = new WebClient()
wc.DownloadString(file)
let unicodeConfusables =
data
|> String.split '\n'
|> Seq.filter (String.startsWith "#" >> not)
|> Seq.map (Seq.takeWhile ((<>) '#') >> Seq.toArray >> String.implode >> String.split ';')
|> Seq.choose mapToConfusable
|> Seq.toArray
let obsfucationConfusables =
let itemToConfusable (orig, repl) =
{ Original = (orig, 0) ||> Char.toCodePoint
Replacement = repl |> String.toCodePoints |> Seq.toArray
Field3 = "OBS" }
[|("1", "i"); ("1", "l")
("2", "z"); ("2", "s")
("3", "e")
("4", "a")
("5", "s"); ("5", "z")
("6", "g"); ("6", "b")
("7", "t")
("8", "b")
("9", "g")
("0", "o")
("\\", "i"); ("\\", "l")
("/", "i"); ("/", "l")
("|", "i"); ("|", "l")
("!", "i"); ("!", "l")
("+", "t")
("@", "a")
("$", "s")
("&", "b")
("(", "c")
("[", "c")|]
|> Array.map itemToConfusable
let confusables =
obsfucationConfusables
|> Array.append unicodeConfusables
let listToCodePoints = List.map (String.toCodePoints >> Seq.toArray)
let filters = ["nope"; "fail"; "leet"] |> listToCodePoints
let terms = ["ℕope"; "𝑵ope"; "ռope"; "nope"; "𝕱ail"; "𝓕ail"; "pass"; "𝕿rue"; "𝓽𝓻𝓾𝒆"; "l33t"; "1337"; "noope"; "failing"] |> listToCodePoints
let findCandidates codePoint = confusables |> Array.filter (fun x -> x.Original = codePoint)
let any = (<) 0
let rec getCombinations<'a> (input : 'a array array) : 'a array array =
let currentArray = input |> Array.head
let tail = input |> Array.tail
if tail |> Array.length |> any then
tail
|> getCombinations
|> Array.map (fun ia ->
currentArray
|> Array.map (fun ca ->
[|[|ca|]; ia|]
|> Array.flatten))
|> Array.flatten
else currentArray |> Array.map Array.initOne
let transformTerm =
Array.map (fun codePoint ->
match codePoint |> findCandidates with
| [||] -> [|[|codePoint|]|]
| candidates -> candidates |> Array.map (fun x -> x.Replacement) |> Array.append [|[|codePoint|]|])
>> getCombinations
>> Array.map Array.flatten
let lowerCaseTerm = Array.map (function | c when [('A' |> int)..('Z' |> int)] |> List.contains c -> c + 32 | c -> c)
//let transformToLowerTerm = transformTerm >> lowerCaseTerm
let transformToLowerTerm = transformTerm >> Array.map lowerCaseTerm
//let matchedFilters term = filters |> List.filter (term |> transformToLowerTerm |> (=))
//let matchedFilters term =
// filters
// |> List.filter (fun filter ->
// let transformations = term |> transformToLowerTerm
// let anyMatch = transformations |> Array.filter ((=) filter)
// anyMatch |> Array.length |> any)
let matchFilters filters (term : int[]) =
let matchFilter (filter : int[]) =
let concated = term |> Array.append filter
let grouped = concated |> Array.groupBy id |> Array.map (fun (c, a) -> (c, a |> Array.length))
let sum = grouped |> Array.fold (fun acc (c, i) -> acc + i / 2) 0 |> float
if sum <= 0.0 then None
else (filter, sum / (max (term.Length |> float) (filter.Length |> float))) |> Some
filters
|> Seq.ofList
|> Seq.choose matchFilter
let bestFilter filters =
matchFilters filters >> Seq.sortByDescending snd >> Seq.tryHead
let meetsThreshold threshold (filter, percent) = percent > threshold
let matchedFilters term =
term
|> transformToLowerTerm
|> Array.map (bestFilter filters >> Option.bindNone ([||], 0.0))
|> Array.sortByDescending snd
|> Array.filter (meetsThreshold 0.5)
|> Array.tryHead
|> (function | None -> [] | Some a -> [a])
let combinationTest = [|[|'g'|]; [|'r'|]; [|'e'; 'a'|]; [|'y'|]|] |> getCombinations |> Array.map String.implode
let matchedTerms =
terms
|> List.map (fun t -> (t, t |> matchedFilters))
|> List.filter (snd >> List.length >> any)
|> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))
//terms |> List.map (transformToLowerTerm >> String.fromCodePoints)
terms |> List.map (transformToLowerTerm >> Array.map String.fromCodePoints)
module Tests =
module Assert =
let private fn fn msg expected actual =
if fn expected actual then None
else (msg expected actual) |> Some
let equal expected actual = fn (=) (sprintf "expected: %A; actual: %A") expected actual
let notEqual expected actual = fn (<>) (sprintf "expected not: %A; actual: %A") expected actual
let largerThan expected actual = fn (>) (sprintf "expected greater than: %A; actual %A") expected actual
let largerEqual expected actual = fn (>=) (sprintf "expected greater than / equal to: %A; actual %A") expected actual
let smallerThan expected actual = fn (<) (sprintf "expected smaller than: %A; actual %A") expected actual
let smallerEqual expected actual = fn (<=) (sprintf "expected smaller than / equal to: %A; actual %A") expected actual
let print = function | None -> printfn "Pass" | Some msg -> printfn "Fail: %s" msg
let matchFilters filters (term : string) =
let matchFilter (filter : string) =
let termChars = term.ToCharArray()
let filterChars = filter.ToCharArray()
let concated = termChars |> Array.append filterChars
let grouped = concated |> Array.groupBy id |> Array.map (fun (c, a) -> (c, a |> Array.length))
let sum = grouped |> Array.fold (fun acc (c, i) -> acc + i / 2) 0 |> float |> (*) 2.0
if sum <= 0.0 then None
else (filter, sum / (term.Length + filter.Length |> float)) |> Some
filters
|> Seq.ofList
|> Seq.choose matchFilter
let bestFilter filters =
matchFilters filters >> Seq.sortByDescending snd >> Seq.tryHead >> Option.map snd
let meetsThreshold threshold (filter, percent) = percent > threshold
let testFn filters = bestFilter filters >> Option.bindNone 0.0
let ``A term that exactly matches a word in the filter should return 1.0 as match`` () =
let expected = 1.0
let inputTerm = "nope"
let inputFilter = ["nope"]
let actual = (inputFilter, inputTerm) ||> testFn
Assert.equal expected actual
let ``A term that does not match any words in the filter should return no matches`` () =
let expected = 0.0
let inputTerm = "abcd"
let inputFilter = ["nope"]
let actual = (inputFilter, inputTerm) ||> testFn
Assert.equal expected actual
let ``A term that partially matches a word in the filter should return 0.* as match`` () =
let expected = 0.25
let inputTerm = "true"
let inputFilter = ["nope"]
let actual = (inputFilter, inputTerm) ||> testFn
Assert.equal expected actual
let ``A term that mostly matches a word in the filter should return 0.* as match`` () =
let expected = 0.75
let inputTerm = "mope"
let inputFilter = ["nope"]
let actual = (inputFilter, inputTerm) ||> testFn
Assert.equal expected actual
let runTests () =
let tests = [``A term that exactly matches a word in the filter should return 1.0 as match``
``A term that does not match any words in the filter should return no matches``
``A term that partially matches a word in the filter should return 0.* as match``
``A term that mostly matches a word in the filter should return 0.* as match``]
tests |> List.iter ((|>) () >> Assert.print)
Now, this file I’ve been working in is 194 lines or so, with several bits of commented code (no longer used, kept it because why not?), things that really should have been extracted away a while ago. Today, we’re going to do that.
Because it’s extremely important to do this (and on a regular basis), I’m going to go through the entire process, which I usually perform as the following steps:
- Identify related, stable bits of code that make sense together;
- Extract these to a new module/function/what-have-you;
- Test your implementation again, refactor anything necessary;
- Extract these to a new file in a .dll, build the .dll and reference it in the FSX (script) file;
- Test your implementation again, refactor anything necessary;
- Repeat for the next section of code;
By the time we get done we’ll have this FSX file down to a few lines (40 or so), we’re going to parameterize everything necessary, and build a new .dll solution.
Let’s get started
So we’ll begin with the easiest section: our Tests
module. This is really already setup exactly how we want: a module with independent parts that we can import together or in whole.
To do this we’ll create an F# Library project, I’ve added references to my F# Extensions project from GitHub, as we’ll use some of the things in it.
Once we have the F# project created, we’ll delete Library1.fs
— it’s unnecessary, we’ll add a new .fs (F# Source File) and call it Tests
, then finally, we’ll cut and paste our module Tests
code from our script. We only need to delete the equal sign after the word Tests
and then everything is done for this part.
The other portions won’t be so easy — we’ll have to rewrite some of our code to work. Our next step is to take some of the Unicode stuff we did and move it to a new module. We want to define a module Unicode =
in our current script file, and begin moving stuff in there, starting with type UnicodeConfusables
.
By the time we finish with the Unicode stuff we should end up with something like:
module Unicode =
type UnicodeConfusables = { Original : int; Replacement : int array; Field3 : string }
let private file = "http://www.unicode.org/Public/security/10.0.0/confusables.txt"
let private getData url =
let file = url |> Option.bindNone file
use wc = new WebClient()
wc.DownloadString(file)
let private mapToConfusable =
function
| [|a; b; c|] ->
{ Original = a |> String.trim |> Int.fromHex
Replacement = b |> String.trim |> String.split ' ' |> Array.map Int.fromHex
Field3 = c |> String.trim } |> Some
| _ -> None
let getConfusables url =
getData url
|> String.split '\n'
|> Seq.filter (String.startsWith "#" >> not)
|> Seq.map (Seq.takeWhile ((<>) '#') >> Seq.toArray >> String.implode >> String.split ';')
|> Seq.choose mapToConfusable
|> Seq.toArray
Now I’ve modified this to allow us to specify an alternate URL to download if we decide, which means we can use a new version of the confusables.txt
if/when there is one, or we can cache it locally.
Next step: terms and filters
Whenever we have a bunch of functions that have the same prefix or suffix, there’s a good chance they belong together. If we look at our code, we have transformTerm
, lowerCaseTerm
, and transformToLowerTerm
. All three of these are transformations on a Term
, so let’s modularize them:
module Term =
let transform =
Array.map (fun codePoint ->
match codePoint |> findCandidates with
| [||] -> [|[|codePoint|]|]
| candidates -> candidates |> Array.map (fun x -> x.Replacement) |> Array.append [|[|codePoint|]|])
>> getCombinations
>> Array.map Array.concat
let lowerCase = Array.map (function | c when [('A' |> int)..('Z' |> int)] |> List.contains c -> c + 32 | c -> c)
let transformToLower = transform >> Array.map lowerCase
Now we’re getting somewhere. A lot of this stuff no longer needs touched, so it’s easier on us if we just put it in a .fs
file and push it out of our current working area.
Of course, now we realize we need the findCandidates
function in the Term
module, as well as getCombinations
. So the next thing we’ll do is move those to more usable locations.
The findCandidates
function can belong in Term
, as the module interacts with the confusables to find candidate confusable. It really doesn’t belong in the Unicode
module…or does it? As we look at it, we realize that findCandidates
can totally belong in the Unicode
module: it needs to know all about confusables, and it cares only about that.
It doesn’t matter (much) where you put it, but I put it in my Term
module. Often times you’ll have several choices of what to do with something, and it’s up to you to make the right choice.
Next we need getCombinations
, which we built to be pretty generic. I’m actually going to put an almost identical version of this in my F# Extensions project (linked above), which you can use. You can also drop this into your own project if you like.
Filters, which are actually quite easy
Just like before we’ll want to create a Filter
module and drop the appropriate bits in there. Here’s what you’ll want to do:
- Rename the functions to fit better;
- Modify
matchedFilters
to takeconfusables,
filters
, and thethreshold
as parameters, in that order, then theterm
itself;
We do the second point because it makes the most sense, if term
is the last parameter we can curry this very well:
let filter = Filter.matched confusables filters 0.5
let matchedTerms =
terms
|> List.map (fun t -> (t, t |> filter))
|> List.filter (snd >> List.length >> any)
|> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))
We set the confusables as the first parameter because they’re likely to want to be swapped the least. Then, between threshold
and filters
, I can see changing threshold
more often than filters
.
Finally, we should be down to a 50-or-so line script:
#I "bin\\Release\\"
#r "FSharpExtensions.dll"
#r "FSharpExtensions.Applications.dll"
#load "Unicode.fs"
#load "Term.fs"
#load "Filter.fs"
#load "Tests.fs"
let listToCodePoints = List.map (String.toCodePoints >> Seq.toArray)
let obsfucationConfusables =
let itemToConfusable (orig, repl) =
{ Unicode.UnicodeConfusables.Original = (orig, 0) ||> Char.toCodePoint
Unicode.UnicodeConfusables.Replacement = repl |> String.toCodePoints |> Seq.toArray
Unicode.UnicodeConfusables.Field3 = "OBS" }
[|("1", "i"); ("1", "l")
("2", "z"); ("2", "s")
("3", "e")
("4", "a")
("5", "s"); ("5", "z")
("6", "g"); ("6", "b")
("7", "t")
("8", "b")
("9", "g")
("0", "o")
("\\", "i"); ("\\", "l")
("/", "i"); ("/", "l")
("|", "i"); ("|", "l")
("!", "i"); ("!", "l")
("+", "t")
("@", "a")
("$", "s")
("&", "b")
("(", "c")
("[", "c")|]
|> Array.map itemToConfusable
let confusables = obsfucationConfusables |> Array.append (Unicode.getConfusables None)
let filters = ["nope"; "fail"; "leet"] |> listToCodePoints
let terms = ["ℕope"; "𝑵ope"; "ռope"; "nope"; "𝕱ail"; "𝓕ail"; "pass"; "𝕿rue"; "𝓽𝓻𝓾𝒆"; "l33t"; "1337"; "noope"; "failing"] |> listToCodePoints
let any = (<) 0
let combinationTest = [|[|'g'|]; [|'r'|]; [|'e'; 'a'|]; [|'y'|]|] |> Array.getCombinations |> Array.map String.implode
let filter = Filter.matched confusables filters 0.5
let matchedTerms =
terms
|> List.map (fun t -> (t, t |> filter))
|> List.filter (snd >> List.length >> any)
|> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))
terms |> List.map (Term.transformToLower confusables >> Array.map String.fromCodePoints)
What bothers me here are the obsfucationConfusables
, we really ought to have a better way to do that, and as it turns out, we do.
This is where having url
as a parameter in the Unicode.getConfusables
function is very useful: we can build a file out for our obsfucationConfusables
and map them with the same getConfusables
function. We’ll create a text file with the following content:
0031 ; 0069 ; OBS
0031 ; 006C ; OBS
0032 ; 007A ; OBS
0032 ; 0073 ; OBS
0033 ; 0065 ; OBS
0034 ; 0061 ; OBS
0035 ; 0073 ; OBS
0035 ; 007A ; OBS
0036 ; 0067 ; OBS
0036 ; 0062 ; OBS
0037 ; 0074 ; OBS
0038 ; 0062 ; OBS
0039 ; 0067 ; OBS
0030 ; 006F ; OBS
005C ; 0069 ; OBS
005C ; 006C ; OBS
002F ; 0069 ; OBS
002F ; 006C ; OBS
007C ; 0069 ; OBS
007C ; 006C ; OBS
0021 ; 0069 ; OBS
0021 ; 006C ; OBS
002B ; 0074 ; OBS
0040 ; 0061 ; OBS
0024 ; 0073 ; OBS
0026 ; 0062 ; OBS
0028 ; 0063 ; OBS
005B ; 0063 ; OBS
Finally, we make all this work in our new script file, and we should have something like the following:
#I "bin\\Release\\"
#r "FSharpExtensions.dll"
#r "FSharpExtensions.Applications.dll"
#load "Unicode.fs"
#load "Term.fs"
#load "Filter.fs"
#load "Tests.fs"
let listToCodePoints = List.map (String.toCodePoints >> Seq.toArray)
let confusables =
__SOURCE_DIRECTORY__ + "\\ObsfucationConfusables.txt"
|> Some
|> Unicode.getConfusables
|> Array.append (Unicode.getConfusables None)
let filters = ["nope"; "fail"; "leet"] |> listToCodePoints
let terms = ["ℕope"; "𝑵ope"; "ռope"; "nope"; "𝕱ail"; "𝓕ail"; "pass"; "𝕿rue"; "𝓽𝓻𝓾𝒆"; "l33t"; "1337"; "noope"; "failing"] |> listToCodePoints
let any = (<) 0
let filter = Filter.matched confusables filters 0.5
let matchedTerms =
terms
|> List.map (fun t -> (t, t |> filter))
|> List.filter (snd >> List.length >> any)
|> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))
terms |> List.map (Term.transformToLower confusables >> Array.map String.fromCodePoints)
Isn’t that much cleaner? I think so. It also prepares us for making an actual program out of this, we know what points can be built out more, what needs parameterized further, and how to continue down our road.
Finally, fix the tests (homework)
At the moment our Tests
module uses the functions inside it, and doesn’t test our live implementations — we really out to fix that.
I leave the majority of the implementation up to you, but you’ll want to start by rewriting testFn
to work with Filter.best
, and move from there. It’s not hard, you only have to redefine testFn
, and optionally define an additional function. I’ll give you that one for free (since it’s so simple):
let mapStr = String.toCodePoints >> Seq.toArray
This takes string -> int []
. (To convert the sample term / filter into codepoints.)
So that was it — we gave ourselves a code-review and I obviously failed, but the new, corrected version should pass. Our next step will be to start modifying this to post an actual message. We want to make sure that we can take a message in and process it, match each word against a term, and then filter everything down to “Spam” or “Not Spam”. This is a lot harder than it sounds, so be aware that it will not be an easy task. We’ll also gradually expand our filter to support more features, and eventually have an enterprise-grade filter ready to use.