Getting Started with Programming and getting absolutely nowhere: Part 19


Cleaning up our previous work

Developing software has a lot of fun parts: solving problems, designing a system, experimenting with what you have finished so far…it has a lot of interesting and delightful things to be done. Unfortunately (or fortunately, depending on how you look at it), developing software also has a couple less delightful parts: debugging, testing, and least of all, cleaning your implementation.

Today we’re going to cover cleaning up our previous (mess) of code, I’m going to post the file I’ve been working with below, but if you’ve been following along you probably have one just as big of a mess:

#I "bin\\Release\\"
#r "Extensions.dll"
open System.Net

type UnicodeConfusables = { Original : int; Replacement : int array; Field3 : string }

let mapToConfusable =
    function
    | [|a; b; c|] ->
        { Original = a |> String.trim |> Int.fromHex
          Replacement = b |> String.trim |> String.split ' ' |> Array.map Int.fromHex
          Field3 = c |> String.trim } |> Some
    | _ -> None

let file = "http://www.unicode.org/Public/security/10.0.0/confusables.txt"
let data =
    use wc = new WebClient()
    wc.DownloadString(file)

let unicodeConfusables =
    data
    |> String.split '\n'
    |> Seq.filter (String.startsWith "#" >> not)
    |> Seq.map (Seq.takeWhile ((<>) '#') >> Seq.toArray >> String.implode >> String.split ';')
    |> Seq.choose mapToConfusable
    |> Seq.toArray
let obsfucationConfusables =
    let itemToConfusable (orig, repl) =
        { Original = (orig, 0) ||> Char.toCodePoint
          Replacement = repl |> String.toCodePoints |> Seq.toArray
          Field3 = "OBS" }
    [|("1", "i"); ("1", "l")
      ("2", "z"); ("2", "s")
      ("3", "e")
      ("4", "a")
      ("5", "s"); ("5", "z")
      ("6", "g"); ("6", "b")
      ("7", "t")
      ("8", "b")
      ("9", "g")
      ("0", "o")
      ("\\", "i"); ("\\", "l")
      ("/", "i"); ("/", "l")
      ("|", "i"); ("|", "l")
      ("!", "i"); ("!", "l")
      ("+", "t")
      ("@", "a")
      ("$", "s")
      ("&", "b")
      ("(", "c")
      ("[", "c")|]
    |> Array.map itemToConfusable

let confusables =
    obsfucationConfusables
    |> Array.append unicodeConfusables

let listToCodePoints = List.map (String.toCodePoints >> Seq.toArray)

let filters = ["nope"; "fail"; "leet"] |> listToCodePoints
let terms = ["ℕope"; "𝑵ope"; "ռope"; "nope"; "𝕱ail"; "𝓕ail"; "pass"; "𝕿rue"; "𝓽𝓻𝓾𝒆"; "l33t"; "1337"; "noope"; "failing"] |> listToCodePoints

let findCandidates codePoint = confusables |> Array.filter (fun x -> x.Original = codePoint)

let any = (<) 0

let rec getCombinations<'a> (input : 'a array array) : 'a array array =
    let currentArray = input |> Array.head
    let tail = input |> Array.tail
    if tail |> Array.length |> any then
        tail
        |> getCombinations
        |> Array.map (fun ia ->
            currentArray
            |> Array.map (fun ca ->
                [|[|ca|]; ia|]
                |> Array.flatten))
        |> Array.flatten
    else currentArray |> Array.map Array.initOne

let transformTerm =
    Array.map (fun codePoint ->
        match codePoint |> findCandidates with
        | [||] -> [|[|codePoint|]|]
        | candidates -> candidates |> Array.map (fun x -> x.Replacement) |> Array.append [|[|codePoint|]|])
    >> getCombinations
    >> Array.map Array.flatten

let lowerCaseTerm = Array.map (function | c when [('A' |> int)..('Z' |> int)] |> List.contains c -> c + 32 | c -> c)

//let transformToLowerTerm = transformTerm >> lowerCaseTerm
let transformToLowerTerm = transformTerm >> Array.map lowerCaseTerm
//let matchedFilters term = filters |> List.filter (term |> transformToLowerTerm |> (=))
//let matchedFilters term =
//    filters
//    |> List.filter (fun filter ->
//        let transformations = term |> transformToLowerTerm
//        let anyMatch = transformations |> Array.filter ((=) filter)
//        anyMatch |> Array.length |> any)

let matchFilters filters (term : int[]) =
    let matchFilter (filter : int[]) =
        let concated = term |> Array.append filter
        let grouped = concated |> Array.groupBy id |> Array.map (fun (c, a) -> (c, a |> Array.length))
        let sum = grouped |> Array.fold (fun acc (c, i) -> acc + i / 2) 0 |> float
        if sum <= 0.0 then None
        else (filter, sum / (max (term.Length |> float) (filter.Length |> float))) |> Some
    filters
    |> Seq.ofList
    |> Seq.choose matchFilter
let bestFilter filters =
    matchFilters filters >> Seq.sortByDescending snd >> Seq.tryHead
let meetsThreshold threshold (filter, percent) = percent > threshold

let matchedFilters term =
    term
    |> transformToLowerTerm
    |> Array.map (bestFilter filters >> Option.bindNone ([||], 0.0))
    |> Array.sortByDescending snd
    |> Array.filter (meetsThreshold 0.5)
    |> Array.tryHead
    |> (function | None -> [] | Some a -> [a])

let combinationTest = [|[|'g'|]; [|'r'|]; [|'e'; 'a'|]; [|'y'|]|] |> getCombinations |> Array.map String.implode

let matchedTerms =
    terms
    |> List.map (fun t -> (t, t |> matchedFilters))
    |> List.filter (snd >> List.length >> any)
    |> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))
//terms |> List.map (transformToLowerTerm >> String.fromCodePoints)
terms |> List.map (transformToLowerTerm >> Array.map String.fromCodePoints)

module Tests =
    module Assert =
        let private fn fn msg expected actual =
            if fn expected actual then None
            else (msg expected actual) |> Some
        let equal expected actual = fn (=) (sprintf "expected: %A; actual: %A") expected actual
        let notEqual expected actual = fn (<>) (sprintf "expected not: %A; actual: %A") expected actual
        let largerThan expected actual = fn (>) (sprintf "expected greater than: %A; actual %A") expected actual
        let largerEqual expected actual = fn (>=) (sprintf "expected greater than / equal to: %A; actual %A") expected actual
        let smallerThan expected actual = fn (<) (sprintf "expected smaller than: %A; actual %A") expected actual
        let smallerEqual expected actual = fn (<=) (sprintf "expected smaller than / equal to: %A; actual %A") expected actual
        let print = function | None -> printfn "Pass" | Some msg -> printfn "Fail: %s" msg

    let matchFilters filters (term : string) =
        let matchFilter (filter : string) =
            let termChars = term.ToCharArray()
            let filterChars = filter.ToCharArray()
            let concated = termChars |> Array.append filterChars
            let grouped = concated |> Array.groupBy id |> Array.map (fun (c, a) -> (c, a |> Array.length))
            let sum = grouped |> Array.fold (fun acc (c, i) -> acc + i / 2) 0 |> float |> (*) 2.0
            if sum <= 0.0 then None
            else (filter, sum / (term.Length + filter.Length |> float)) |> Some
        filters
        |> Seq.ofList
        |> Seq.choose matchFilter
    let bestFilter filters =
        matchFilters filters >> Seq.sortByDescending snd >> Seq.tryHead >> Option.map snd
    let meetsThreshold threshold (filter, percent) = percent > threshold

    let testFn filters = bestFilter filters >> Option.bindNone 0.0

    let ``A term that exactly matches a word in the filter should return 1.0 as match`` () =
        let expected = 1.0
        let inputTerm = "nope"
        let inputFilter = ["nope"]
        let actual = (inputFilter, inputTerm) ||> testFn
        Assert.equal expected actual

    let ``A term that does not match any words in the filter should return no matches`` () =
        let expected = 0.0
        let inputTerm = "abcd"
        let inputFilter = ["nope"]
        let actual = (inputFilter, inputTerm) ||> testFn
        Assert.equal expected actual
    
    let ``A term that partially matches a word in the filter should return 0.* as match`` () =
        let expected = 0.25
        let inputTerm = "true"
        let inputFilter = ["nope"]
        let actual = (inputFilter, inputTerm) ||> testFn
        Assert.equal expected actual

    let ``A term that mostly matches a word in the filter should return 0.* as match`` () =
        let expected = 0.75
        let inputTerm = "mope"
        let inputFilter = ["nope"]
        let actual = (inputFilter, inputTerm) ||> testFn
        Assert.equal expected actual

    let runTests () =
        let tests = [``A term that exactly matches a word in the filter should return 1.0 as match``
                     ``A term that does not match any words in the filter should return no matches``
                     ``A term that partially matches a word in the filter should return 0.* as match``
                     ``A term that mostly matches a word in the filter should return 0.* as match``]
        tests |> List.iter ((|>) () >> Assert.print)

Now, this file I’ve been working in is 194 lines or so, with several bits of commented code (no longer used, kept it because why not?), things that really should have been extracted away a while ago. Today, we’re going to do that.

Because it’s extremely important to do this (and on a regular basis), I’m going to go through the entire process, which I usually perform as the following steps:

  1. Identify related, stable bits of code that make sense together;
  2. Extract these to a new module/function/what-have-you;
  3. Test your implementation again, refactor anything necessary;
  4. Extract these to a new file in a .dll, build the .dll and reference it in the FSX (script) file;
  5. Test your implementation again, refactor anything necessary;
  6. Repeat for the next section of code;

By the time we get done we’ll have this FSX file down to a few lines (40 or so), we’re going to parameterize everything necessary, and build a new .dll solution.

Let’s get started

So we’ll begin with the easiest section: our Tests module. This is really already setup exactly how we want: a module with independent parts that we can import together or in whole.

To do this we’ll create an F# Library project, I’ve added references to my F# Extensions project from GitHub, as we’ll use some of the things in it.

Once we have the F# project created, we’ll delete Library1.fs — it’s unnecessary, we’ll add a new .fs (F# Source File) and call it Tests, then finally, we’ll cut and paste our module Tests code from our script. We only need to delete the equal sign after the word Tests and then everything is done for this part.

The other portions won’t be so easy — we’ll have to rewrite some of our code to work. Our next step is to take some of the Unicode stuff we did and move it to a new module. We want to define a module Unicode = in our current script file, and begin moving stuff in there, starting with type UnicodeConfusables.

By the time we finish with the Unicode stuff we should end up with something like:

module Unicode =
    type UnicodeConfusables = { Original : int; Replacement : int array; Field3 : string }

    let private file = "http://www.unicode.org/Public/security/10.0.0/confusables.txt"
    let private getData url =
        let file = url |> Option.bindNone file
        use wc = new WebClient()
        wc.DownloadString(file)

    let private mapToConfusable =
        function
        | [|a; b; c|] ->
            { Original = a |> String.trim |> Int.fromHex
              Replacement = b |> String.trim |> String.split ' ' |> Array.map Int.fromHex
              Field3 = c |> String.trim } |> Some
        | _ -> None

    let getConfusables url =
        getData url
        |> String.split '\n'
        |> Seq.filter (String.startsWith "#" >> not)
        |> Seq.map (Seq.takeWhile ((<>) '#') >> Seq.toArray >> String.implode >> String.split ';')
        |> Seq.choose mapToConfusable
        |> Seq.toArray

Now I’ve modified this to allow us to specify an alternate URL to download if we decide, which means we can use a new version of the confusables.txt if/when there is one, or we can cache it locally.

Next step: terms and filters

Whenever we have a bunch of functions that have the same prefix or suffix, there’s a good chance they belong together. If we look at our code, we have transformTerm, lowerCaseTerm, and transformToLowerTerm. All three of these are transformations on a Term, so let’s modularize them:

module Term =
    let transform =
        Array.map (fun codePoint ->
            match codePoint |> findCandidates with
            | [||] -> [|[|codePoint|]|]
            | candidates -> candidates |> Array.map (fun x -> x.Replacement) |> Array.append [|[|codePoint|]|])
        >> getCombinations
        >> Array.map Array.concat

    let lowerCase = Array.map (function | c when [('A' |> int)..('Z' |> int)] |> List.contains c -> c + 32 | c -> c)
    let transformToLower = transform >> Array.map lowerCase

Now we’re getting somewhere. A lot of this stuff no longer needs touched, so it’s easier on us if we just put it in a .fs file and push it out of our current working area.

Of course, now we realize we need the findCandidates function in the Term module, as well as getCombinations. So the next thing we’ll do is move those to more usable locations.

The findCandidates function can belong in Term, as the module interacts with the confusables to find candidate confusable. It really doesn’t belong in the Unicode module…or does it? As we look at it, we realize that findCandidates can totally belong in the Unicode module: it needs to know all about confusables, and it cares only about that.

It doesn’t matter (much) where you put it, but I put it in my Term module. Often times you’ll have several choices of what to do with something, and it’s up to you to make the right choice.

Next we need getCombinations, which we built to be pretty generic. I’m actually going to put an almost identical version of this in my F# Extensions project (linked above), which you can use. You can also drop this into your own project if you like.

Filters, which are actually quite easy

Just like before we’ll want to create a Filter module and drop the appropriate bits in there. Here’s what you’ll want to do:

  • Rename the functions to fit better;
  • Modify matchedFilters to take confusables, filters, and the threshold as parameters, in that order, then the term itself;

We do the second point because it makes the most sense, if term is the last parameter we can curry this very well:

let filter = Filter.matched confusables filters 0.5
let matchedTerms =
    terms
    |> List.map (fun t -> (t, t |> filter))
    |> List.filter (snd >> List.length >> any)
    |> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))

We set the confusables as the first parameter because they’re likely to want to be swapped the least. Then, between threshold and filters, I can see changing threshold more often than filters.

Finally, we should be down to a 50-or-so line script:

#I "bin\\Release\\"
#r "FSharpExtensions.dll"
#r "FSharpExtensions.Applications.dll"

#load "Unicode.fs"
#load "Term.fs"
#load "Filter.fs"
#load "Tests.fs"

let listToCodePoints = List.map (String.toCodePoints >> Seq.toArray)

let obsfucationConfusables =
    let itemToConfusable (orig, repl) =
        { Unicode.UnicodeConfusables.Original = (orig, 0) ||> Char.toCodePoint
          Unicode.UnicodeConfusables.Replacement = repl |> String.toCodePoints |> Seq.toArray
          Unicode.UnicodeConfusables.Field3 = "OBS" }
    [|("1", "i"); ("1", "l")
      ("2", "z"); ("2", "s")
      ("3", "e")
      ("4", "a")
      ("5", "s"); ("5", "z")
      ("6", "g"); ("6", "b")
      ("7", "t")
      ("8", "b")
      ("9", "g")
      ("0", "o")
      ("\\", "i"); ("\\", "l")
      ("/", "i"); ("/", "l")
      ("|", "i"); ("|", "l")
      ("!", "i"); ("!", "l")
      ("+", "t")
      ("@", "a")
      ("$", "s")
      ("&", "b")
      ("(", "c")
      ("[", "c")|]
    |> Array.map itemToConfusable

let confusables = obsfucationConfusables |> Array.append (Unicode.getConfusables None)
let filters = ["nope"; "fail"; "leet"] |> listToCodePoints
let terms = ["ℕope"; "𝑵ope"; "ռope"; "nope"; "𝕱ail"; "𝓕ail"; "pass"; "𝕿rue"; "𝓽𝓻𝓾𝒆"; "l33t"; "1337"; "noope"; "failing"] |> listToCodePoints

let any = (<) 0

let combinationTest = [|[|'g'|]; [|'r'|]; [|'e'; 'a'|]; [|'y'|]|] |> Array.getCombinations |> Array.map String.implode

let filter = Filter.matched confusables filters 0.5

let matchedTerms =
    terms
    |> List.map (fun t -> (t, t |> filter))
    |> List.filter (snd >> List.length >> any)
    |> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))
terms |> List.map (Term.transformToLower confusables >> Array.map String.fromCodePoints)

What bothers me here are the obsfucationConfusables, we really ought to have a better way to do that, and as it turns out, we do.

This is where having url as a parameter in the Unicode.getConfusables function is very useful: we can build a file out for our obsfucationConfusables and map them with the same getConfusables function. We’ll create a text file with the following content:

0031 ; 0069 ; OBS
0031 ; 006C ; OBS
0032 ; 007A ; OBS
0032 ; 0073 ; OBS
0033 ; 0065 ; OBS
0034 ; 0061 ; OBS
0035 ; 0073 ; OBS
0035 ; 007A ; OBS
0036 ; 0067 ; OBS
0036 ; 0062 ; OBS
0037 ; 0074 ; OBS
0038 ; 0062 ; OBS
0039 ; 0067 ; OBS
0030 ; 006F ; OBS
005C ; 0069 ; OBS
005C ; 006C ; OBS
002F ; 0069 ; OBS
002F ; 006C ; OBS
007C ; 0069 ; OBS
007C ; 006C ; OBS
0021 ; 0069 ; OBS
0021 ; 006C ; OBS
002B ; 0074 ; OBS
0040 ; 0061 ; OBS
0024 ; 0073 ; OBS
0026 ; 0062 ; OBS
0028 ; 0063 ; OBS
005B ; 0063 ; OBS

Finally, we make all this work in our new script file, and we should have something like the following:

#I "bin\\Release\\"
#r "FSharpExtensions.dll"
#r "FSharpExtensions.Applications.dll"

#load "Unicode.fs"
#load "Term.fs"
#load "Filter.fs"
#load "Tests.fs"

let listToCodePoints = List.map (String.toCodePoints >> Seq.toArray)

let confusables =
    __SOURCE_DIRECTORY__ + "\\ObsfucationConfusables.txt"
    |> Some
    |> Unicode.getConfusables
    |> Array.append (Unicode.getConfusables None)
let filters = ["nope"; "fail"; "leet"] |> listToCodePoints
let terms = ["ℕope"; "𝑵ope"; "ռope"; "nope"; "𝕱ail"; "𝓕ail"; "pass"; "𝕿rue"; "𝓽𝓻𝓾𝒆"; "l33t"; "1337"; "noope"; "failing"] |> listToCodePoints

let any = (<) 0
let filter = Filter.matched confusables filters 0.5

let matchedTerms =
    terms
    |> List.map (fun t -> (t, t |> filter))
    |> List.filter (snd >> List.length >> any)
    |> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))
terms |> List.map (Term.transformToLower confusables >> Array.map String.fromCodePoints)

Isn’t that much cleaner? I think so. It also prepares us for making an actual program out of this, we know what points can be built out more, what needs parameterized further, and how to continue down our road.

Finally, fix the tests (homework)

At the moment our Tests module uses the functions inside it, and doesn’t test our live implementations — we really out to fix that.

I leave the majority of the implementation up to you, but you’ll want to start by rewriting testFn to work with Filter.best, and move from there. It’s not hard, you only have to redefine testFn, and optionally define an additional function. I’ll give you that one for free (since it’s so simple):

let mapStr = String.toCodePoints >> Seq.toArray

This takes string -> int []. (To convert the sample term / filter into codepoints.)


So that was it — we gave ourselves a code-review and I obviously failed, but the new, corrected version should pass. Our next step will be to start modifying this to post an actual message. We want to make sure that we can take a message in and process it, match each word against a term, and then filter everything down to “Spam” or “Not Spam”. This is a lot harder than it sounds, so be aware that it will not be an easy task. We’ll also gradually expand our filter to support more features, and eventually have an enterprise-grade filter ready to use.

, ,

Leave a Reply

Your email address will not be published. Required fields are marked *