using Programming;

A Blog about some of the intrinsics related to programming and how one can get the best out of various languages.

Getting started with programming and getting absolutely nowhere (Part 19)

Cleaning up our previous work

Lesson 18: Documentation: it's really important

Developing software has a lot of fun parts: solving problems, designing a system, experimenting with what you have finished so far...it has a lot of interesting and delightful things to be done. Unfortunately (or fortunately, depending on how you look at it), developing software also has a couple less delightful parts: debugging, testing, and least of all, cleaning your implementation.

Today we're going to cover cleaning up our previous (mess) of code, I'm going to post the file I've been working with below, but if you've been following along you probably have one just as big of a mess:

#I "bin\\Release\\"
#r "Extensions.dll"
open System.Net

type UnicodeConfusables = { Original : int; Replacement : int array; Field3 : string }

let mapToConfusable =
    function
    | [|a; b; c|] ->
        { Original = a |> String.trim |> Int.fromHex
          Replacement = b |> String.trim |> String.split ' ' |> Array.map Int.fromHex
          Field3 = c |> String.trim } |> Some
    | _ -> None

let file = "http://www.unicode.org/Public/security/10.0.0/confusables.txt"
let data =
    use wc = new WebClient()
    wc.DownloadString(file)

let unicodeConfusables =
    data
    |> String.split '\n'
    |> Seq.filter (String.startsWith "#" >> not)
    |> Seq.map (Seq.takeWhile ((<>) '#') >> Seq.toArray >> String.implode >> String.split ';')
    |> Seq.choose mapToConfusable
    |> Seq.toArray
let obsfucationConfusables =
    let itemToConfusable (orig, repl) =
        { Original = (orig, 0) ||> Char.toCodePoint
          Replacement = repl |> String.toCodePoints |> Seq.toArray
          Field3 = "OBS" }
    [|("1", "i"); ("1", "l")
      ("2", "z"); ("2", "s")
      ("3", "e")
      ("4", "a")
      ("5", "s"); ("5", "z")
      ("6", "g"); ("6", "b")
      ("7", "t")
      ("8", "b")
      ("9", "g")
      ("0", "o")
      ("\\", "i"); ("\\", "l")
      ("/", "i"); ("/", "l")
      ("|", "i"); ("|", "l")
      ("!", "i"); ("!", "l")
      ("+", "t")
      ("@", "a")
      ("$", "s")
      ("&", "b")
      ("(", "c")
      ("[", "c")|]
    |> Array.map itemToConfusable

let confusables =
    obsfucationConfusables
    |> Array.append unicodeConfusables

let listToCodePoints = List.map (String.toCodePoints >> Seq.toArray)

let filters = ["nope"; "fail"; "leet"] |> listToCodePoints
let terms = ["ℕope"; "𝑵ope"; "ռope"; "nope"; "𝕱ail"; "𝓕ail"; "pass"; "𝕿rue"; "𝓽𝓻𝓾𝒆"; "l33t"; "1337"; "noope"; "failing"] |> listToCodePoints

let findCandidates codePoint = confusables |> Array.filter (fun x -> x.Original = codePoint)

let any = (<) 0

let rec getCombinations<'a> (input : 'a array array) : 'a array array =
    let currentArray = input |> Array.head
    let tail = input |> Array.tail
    if tail |> Array.length |> any then
        tail
        |> getCombinations
        |> Array.map (fun ia ->
            currentArray
            |> Array.map (fun ca ->
                [|[|ca|]; ia|]
                |> Array.flatten))
        |> Array.flatten
    else currentArray |> Array.map Array.initOne

let transformTerm =
    Array.map (fun codePoint ->
        match codePoint |> findCandidates with
        | [||] -> [|[|codePoint|]|]
        | candidates -> candidates |> Array.map (fun x -> x.Replacement) |> Array.append [|[|codePoint|]|])
    >> getCombinations
    >> Array.map Array.flatten

let lowerCaseTerm = Array.map (function | c when [('A' |> int)..('Z' |> int)] |> List.contains c -> c + 32 | c -> c)

//let transformToLowerTerm = transformTerm >> lowerCaseTerm
let transformToLowerTerm = transformTerm >> Array.map lowerCaseTerm
//let matchedFilters term = filters |> List.filter (term |> transformToLowerTerm |> (=))
//let matchedFilters term =
//    filters
//    |> List.filter (fun filter ->
//        let transformations = term |> transformToLowerTerm
//        let anyMatch = transformations |> Array.filter ((=) filter)
//        anyMatch |> Array.length |> any)

let matchFilters filters (term : int[]) =
    let matchFilter (filter : int[]) =
        let concated = term |> Array.append filter
        let grouped = concated |> Array.groupBy id |> Array.map (fun (c, a) -> (c, a |> Array.length))
        let sum = grouped |> Array.fold (fun acc (c, i) -> acc + i / 2) 0 |> float
        if sum <= 0.0 then None
        else (filter, sum / (max (term.Length |> float) (filter.Length |> float))) |> Some
    filters
    |> Seq.ofList
    |> Seq.choose matchFilter
let bestFilter filters =
    matchFilters filters >> Seq.sortByDescending snd >> Seq.tryHead
let meetsThreshold threshold (filter, percent) = percent > threshold

let matchedFilters term =
    term
    |> transformToLowerTerm
    |> Array.map (bestFilter filters >> Option.bindNone ([||], 0.0))
    |> Array.sortByDescending snd
    |> Array.filter (meetsThreshold 0.5)
    |> Array.tryHead
    |> (function | None -> [] | Some a -> [a])

let combinationTest = [|[|'g'|]; [|'r'|]; [|'e'; 'a'|]; [|'y'|]|] |> getCombinations |> Array.map String.implode

let matchedTerms =
    terms
    |> List.map (fun t -> (t, t |> matchedFilters))
    |> List.filter (snd >> List.length >> any)
    |> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))
//terms |> List.map (transformToLowerTerm >> String.fromCodePoints)
terms |> List.map (transformToLowerTerm >> Array.map String.fromCodePoints)

module Tests =
    module Assert =
        let private fn fn msg expected actual =
            if fn expected actual then None
            else (msg expected actual) |> Some
        let equal expected actual = fn (=) (sprintf "expected: %A; actual: %A") expected actual
        let notEqual expected actual = fn (<>) (sprintf "expected not: %A; actual: %A") expected actual
        let largerThan expected actual = fn (>) (sprintf "expected greater than: %A; actual %A") expected actual
        let largerEqual expected actual = fn (>=) (sprintf "expected greater than / equal to: %A; actual %A") expected actual
        let smallerThan expected actual = fn (<) (sprintf "expected smaller than: %A; actual %A") expected actual
        let smallerEqual expected actual = fn (<=) (sprintf "expected smaller than / equal to: %A; actual %A") expected actual
        let print = function | None -> printfn "Pass" | Some msg -> printfn "Fail: %s" msg

    let matchFilters filters (term : string) =
        let matchFilter (filter : string) =
            let termChars = term.ToCharArray()
            let filterChars = filter.ToCharArray()
            let concated = termChars |> Array.append filterChars
            let grouped = concated |> Array.groupBy id |> Array.map (fun (c, a) -> (c, a |> Array.length))
            let sum = grouped |> Array.fold (fun acc (c, i) -> acc + i / 2) 0 |> float |> (*) 2.0
            if sum <= 0.0 then None
            else (filter, sum / (term.Length + filter.Length |> float)) |> Some
        filters
        |> Seq.ofList
        |> Seq.choose matchFilter
    let bestFilter filters =
        matchFilters filters >> Seq.sortByDescending snd >> Seq.tryHead >> Option.map snd
    let meetsThreshold threshold (filter, percent) = percent > threshold

    let testFn filters = bestFilter filters >> Option.bindNone 0.0

    let ``A term that exactly matches a word in the filter should return 1.0 as match`` () =
        let expected = 1.0
        let inputTerm = "nope"
        let inputFilter = ["nope"]
        let actual = (inputFilter, inputTerm) ||> testFn
        Assert.equal expected actual

    let ``A term that does not match any words in the filter should return no matches`` () =
        let expected = 0.0
        let inputTerm = "abcd"
        let inputFilter = ["nope"]
        let actual = (inputFilter, inputTerm) ||> testFn
        Assert.equal expected actual

    let ``A term that partially matches a word in the filter should return 0.* as match`` () =
        let expected = 0.25
        let inputTerm = "true"
        let inputFilter = ["nope"]
        let actual = (inputFilter, inputTerm) ||> testFn
        Assert.equal expected actual

    let ``A term that mostly matches a word in the filter should return 0.* as match`` () =
        let expected = 0.75
        let inputTerm = "mope"
        let inputFilter = ["nope"]
        let actual = (inputFilter, inputTerm) ||> testFn
        Assert.equal expected actual

    let runTests () =
        let tests = [``A term that exactly matches a word in the filter should return 1.0 as match``
                     ``A term that does not match any words in the filter should return no matches``
                     ``A term that partially matches a word in the filter should return 0.* as match``
                     ``A term that mostly matches a word in the filter should return 0.* as match``]
        tests |> List.iter ((|>) () >> Assert.print)

Now, this file I've been working in is 194 lines or so, with several bits of commented code (no longer used, kept it because why not?), things that really should have been extracted away a while ago. Today, we're going to do that.

Because it's extremely important to do this (and on a regular basis), I'm going to go through the entire process, which I usually perform as the following steps:

  1. Identify related, stable bits of code that make sense together;
  2. Extract these to a new module/function/what-have-you;
  3. Test your implementation again, refactor anything necessary;
  4. Extract these to a new file in a .dll, build the .dll and reference it in the FSX (script) file;
  5. Test your implementation again, refactor anything necessary;
  6. Repeat for the next section of code;

By the time we get done we'll have this FSX file down to a few lines (40 or so), we're going to parameterize everything necessary, and build a new .dll solution.

Let's get started

So we'll begin with the easiest section: our Tests module. This is really already setup exactly how we want: a module with independent parts that we can import together or in whole.

To do this we'll create an F# Library project, I've added references to my F# Extensions project from GitHub, as we'll use some of the things in it.

Once we have the F# project created, we'll delete Library1.fs — it's unnecessary, we'll add a new .fs (F# Source File) and call it Tests, then finally, we'll cut and paste our module Tests code from our script. We only need to delete the equal sign after the word Tests and then everything is done for this part.

The other portions won't be so easy — we'll have to rewrite some of our code to work. Our next step is to take some of the Unicode stuff we did and move it to a new module. We want to define a module Unicode = in our current script file, and begin moving stuff in there, starting with type UnicodeConfusables.

By the time we finish with the Unicode stuff we should end up with something like:

module Unicode =
    type UnicodeConfusables = { Original : int; Replacement : int array; Field3 : string }

    let private file = "http://www.unicode.org/Public/security/10.0.0/confusables.txt"
    let private getData url =
        let file = url |> Option.bindNone file
        use wc = new WebClient()
        wc.DownloadString(file)

    let private mapToConfusable =
        function
        | [|a; b; c|] ->
            { Original = a |> String.trim |> Int.fromHex
              Replacement = b |> String.trim |> String.split ' ' |> Array.map Int.fromHex
              Field3 = c |> String.trim } |> Some
        | _ -> None

    let getConfusables url =
        getData url
        |> String.split '\n'
        |> Seq.filter (String.startsWith "#" >> not)
        |> Seq.map (Seq.takeWhile ((<>) '#') >> Seq.toArray >> String.implode >> String.split ';')
        |> Seq.choose mapToConfusable
        |> Seq.toArray

Now I've modified this to allow us to specify an alternate URL to download if we decide, which means we can use a new version of the confusables.txt if/when there is one, or we can cache it locally.

Next step: terms and filters

Whenever we have a bunch of functions that have the same prefix or suffix, there's a good chance they belong together. If we look at our code, we have transformTerm, lowerCaseTerm, and transformToLowerTerm. All three of these are transformations on a Term, so let's modularize them:

module Term =
    let transform =
        Array.map (fun codePoint ->
            match codePoint |> findCandidates with
            | [||] -> [|[|codePoint|]|]
            | candidates -> candidates |> Array.map (fun x -> x.Replacement) |> Array.append [|[|codePoint|]|])
        >> getCombinations
        >> Array.map Array.concat

    let lowerCase = Array.map (function | c when [('A' |> int)..('Z' |> int)] |> List.contains c -> c + 32 | c -> c)
    let transformToLower = transform >> Array.map lowerCase

Now we're getting somewhere. A lot of this stuff no longer needs touched, so it's easier on us if we just put it in a .fs file and push it out of our current working area.

Of course, now we realize we need the findCandidates function in the Term module, as well as getCombinations. So the next thing we'll do is move those to more usable locations.

The findCandidates function can belong in Term, as the module interacts with the confusables to find candidate confusable. It really doesn't belong in the Unicode module...or does it? As we look at it, we realize that findCandidates can totally belong in the Unicode module: it needs to know all about confusables, and it cares only about that.

It doesn't matter (much) where you put it, but I put it in my Term module. Often times you'll have several choices of what to do with something, and it's up to you to make the right choice.

Next we need getCombinations, which we built to be pretty generic. I'm actually going to put an almost identical version of this in my F# Extensions project (linked above), which you can use. You can also drop this into your own project if you like.

Filters, which are actually quite easy

Just like before we'll want to create a Filter module and drop the appropriate bits in there. Here's what you'll want to do:

  • Rename the functions to fit better;
  • Modify matchedFilters to take confusables, filters, and the threshold as parameters, in that order, then the term itself;

We do the second point because it makes the most sense, if term is the last parameter we can curry this very well:

let filter = Filter.matched confusables filters 0.5
let matchedTerms =
    terms
    |> List.map (fun t -> (t, t |> filter))
    |> List.filter (snd >> List.length >> any)
    |> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))

We set the confusables as the first parameter because they're likely to want to be swapped the least. Then, between threshold and filters, I can see changing threshold more often than filters.

Finally, we should be down to a 50-or-so line script:

#I "bin\\Release\\"
#r "FSharpExtensions.dll"
#r "FSharpExtensions.Applications.dll"

#load "Unicode.fs"
#load "Term.fs"
#load "Filter.fs"
#load "Tests.fs"

let listToCodePoints = List.map (String.toCodePoints >> Seq.toArray)

let obsfucationConfusables =
    let itemToConfusable (orig, repl) =
        { Unicode.UnicodeConfusables.Original = (orig, 0) ||> Char.toCodePoint
          Unicode.UnicodeConfusables.Replacement = repl |> String.toCodePoints |> Seq.toArray
          Unicode.UnicodeConfusables.Field3 = "OBS" }
    [|("1", "i"); ("1", "l")
      ("2", "z"); ("2", "s")
      ("3", "e")
      ("4", "a")
      ("5", "s"); ("5", "z")
      ("6", "g"); ("6", "b")
      ("7", "t")
      ("8", "b")
      ("9", "g")
      ("0", "o")
      ("\\", "i"); ("\\", "l")
      ("/", "i"); ("/", "l")
      ("|", "i"); ("|", "l")
      ("!", "i"); ("!", "l")
      ("+", "t")
      ("@", "a")
      ("$", "s")
      ("&", "b")
      ("(", "c")
      ("[", "c")|]
    |> Array.map itemToConfusable

let confusables = obsfucationConfusables |> Array.append (Unicode.getConfusables None)
let filters = ["nope"; "fail"; "leet"] |> listToCodePoints
let terms = ["ℕope"; "𝑵ope"; "ռope"; "nope"; "𝕱ail"; "𝓕ail"; "pass"; "𝕿rue"; "𝓽𝓻𝓾𝒆"; "l33t"; "1337"; "noope"; "failing"] |> listToCodePoints

let any = (<) 0

let combinationTest = [|[|'g'|]; [|'r'|]; [|'e'; 'a'|]; [|'y'|]|] |> Array.getCombinations |> Array.map String.implode

let filter = Filter.matched confusables filters 0.5

let matchedTerms =
    terms
    |> List.map (fun t -> (t, t |> filter))
    |> List.filter (snd >> List.length >> any)
    |> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))
terms |> List.map (Term.transformToLower confusables >> Array.map String.fromCodePoints)

What bothers me here are the obsfucationConfusables, we really ought to have a better way to do that, and as it turns out, we do.

This is where having url as a parameter in the Unicode.getConfusables function is very useful: we can build a file out for our obsfucationConfusables and map them with the same getConfusables function. We'll create a text file with the following content:

0031 ; 0069 ; OBS
0031 ; 006C ; OBS
0032 ; 007A ; OBS
0032 ; 0073 ; OBS
0033 ; 0065 ; OBS
0034 ; 0061 ; OBS
0035 ; 0073 ; OBS
0035 ; 007A ; OBS
0036 ; 0067 ; OBS
0036 ; 0062 ; OBS
0037 ; 0074 ; OBS
0038 ; 0062 ; OBS
0039 ; 0067 ; OBS
0030 ; 006F ; OBS
005C ; 0069 ; OBS
005C ; 006C ; OBS
002F ; 0069 ; OBS
002F ; 006C ; OBS
007C ; 0069 ; OBS
007C ; 006C ; OBS
0021 ; 0069 ; OBS
0021 ; 006C ; OBS
002B ; 0074 ; OBS
0040 ; 0061 ; OBS
0024 ; 0073 ; OBS
0026 ; 0062 ; OBS
0028 ; 0063 ; OBS
005B ; 0063 ; OBS

Finally, we make all this work in our new script file, and we should have something like the following:

#I "bin\\Release\\"
#r "FSharpExtensions.dll"
#r "FSharpExtensions.Applications.dll"

#load "Unicode.fs"
#load "Term.fs"
#load "Filter.fs"
#load "Tests.fs"

let listToCodePoints = List.map (String.toCodePoints >> Seq.toArray)

let confusables =
    __SOURCE_DIRECTORY__ + "\\ObsfucationConfusables.txt"
    |> Some
    |> Unicode.getConfusables
    |> Array.append (Unicode.getConfusables None)
let filters = ["nope"; "fail"; "leet"] |> listToCodePoints
let terms = ["ℕope"; "𝑵ope"; "ռope"; "nope"; "𝕱ail"; "𝓕ail"; "pass"; "𝕿rue"; "𝓽𝓻𝓾𝒆"; "l33t"; "1337"; "noope"; "failing"] |> listToCodePoints

let any = (<) 0
let filter = Filter.matched confusables filters 0.5

let matchedTerms =
    terms
    |> List.map (fun t -> (t, t |> filter))
    |> List.filter (snd >> List.length >> any)
    |> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))
terms |> List.map (Term.transformToLower confusables >> Array.map String.fromCodePoints)

Isn't that much cleaner? I think so. It also prepares us for making an actual program out of this, we know what points can be built out more, what needs parameterized further, and how to continue down our road.

Finally, fix the tests (homework)

At the moment our Tests module uses the functions inside it, and doesn't test our live implementations — we really out to fix that.

I leave the majority of the implementation up to you, but you'll want to start by rewriting testFn to work with Filter.best, and move from there. It's not hard, you only have to redefine testFn, and optionally define an additional function. I'll give you that one for free (since it's so simple):

let mapStr = String.toCodePoints >> Seq.toArray

This takes string -> int []. (To convert the sample term / filter into codepoints.)


So that was it — we gave ourselves a code-review and I obviously failed, but the new, corrected version should pass. Our next step will be to start modifying this to post an actual message. We want to make sure that we can take a message in and process it, match each word against a term, and then filter everything down to "Spam" or "Not Spam". This is a lot harder than it sounds, so be aware that it will not be an easy task. We'll also gradually expand our filter to support more features, and eventually have an enterprise-grade filter ready to use.

Getting started with programming and getting absolutely nowhere (Part 18)

Documentation: it's really important

Lesson 17: How close are two words?

I'm going to take this lesson to talk about the elephant in the room when it comes to programming: documentation.

I've said it before, and I'll say it again: documentation is really important. Whenever we write code we should always think of a way to document it, and we should try to do so as we write it. Don't make it an afterthought — too often businesses make documentation a thing that isn't done until everything else is done, this is bad. This creates a system where we not only have no idea how to use something, but we have to look at the source-code to understand it.

Recently, my friend Chris mentioned that he read the following statement from Google themselves in regards to Android:

Even the best developers frequently make mistakes in the way they interact with the Android system and with other applications

This is a very true aspect of Android development, and after thinking about it, I think we can blame the documentation.

Let's be realistic, there's hardly every a "bad system", there's mostly "bad usage" of the system, often encouraged by some sort of failure in the documentation:

  • Missing information on how to properly use an API;
  • A complete lack of examples on usage;
  • No explanation of how something works, simply examples;
  • Out-of-date documentation, such as in regard to Rev A, when we're on Rev L;
  • A lack of organization (a sane person expectes A.B to be under A, but it's actually under C);
  • Too much documentation, more a "word problem" than anything else;

I'm going to use the Android documentation as an example here, because it really falls into the latter category which is possibly the biggest issue you can have. As people, we take the path of least resistance (go figure, so does electrical energy, water flow, air flow, basically everything ever). If we need to go from A to B to C, we do that. If D is optional, and it's out of the way, we probably won't go. It's human nature, we are designed to take the easiest, quickest or most effective path.

When it comes to documentation there's a "sweet middle", you can have too little, and you can have too much. Both of those are dastardly: in the first case the software cannot be understood, in the second it probably won't be understood. As an example of how documentation harms, let's look at the Android API documentation.

There's no immediate call to action to the documentation

When you first hit this page it leaves you wanted to know how and where to go. I want to know how to get a location via GPS, how the hell do I navigate to it? For those who want the answer, it's:

  1. Click "API Guides" on the left bar (probably in a new spot by the time you read this)
  2. Location and Sensors
  3. Location Strategies

We have two problems here: first, I would not expect GPS information to be under "Location Strategies", I see "Position Sensors" and gravitate there. Second: this page gives you a plethora of information, too much information, in fact. I came looking for a sample of gathering the GPS location, what I got was a bunch of word-problems, and a massive block of code that, without context, really only tells us how to determine if a location is better or not. What we need to do is really read into "Deciding when to start listening for updates", as this tells us how to hook the GPS Location provider to our code.

It's been a while since I've dug into the Android documentation, but that took me 30 minutes to find. You know what's easier? Google: anrdoid get gps location, first result for me is Stack Overflow with a great, and brief, explanation. Realistically, Google could have given us a "cliff-notes" version that was equivalent to this Stack Overflow post. But they didn't — instead we had to put in a significant amount of effort to find it, and it's not in the spot we would expect. Too much documentation, and it's disorganized. Without reading paragraph after paragraph (many of which are details irrelevant to getting the location) we really won't get an answer from the documentation.

Documentation is always an afterthought

I've been in the industry a while, and I've never come across a situation where documentation was a priority. It's always an afterthought. Even in languages like F# where the majority of the hard work is done by a single /// comment, it's still an afterthought.

The problem with almost every single documentation system out there is that it's not written by the person who wrote the code. We often have technical writers (who often still have very good skills in programming) write the documentation — most of the time they're just as good at writing software as any developer, but they choose not to. They instead write documentation, the problem being they didn't write the code, so they're writing documentation based on what they can read of the code. Anyone who is a developer knows that reading even simple code written by another developer is often very difficult. It's a hard thing to do — and it's largely because it requires complex thought processes. I don't think the same way as you, and you don't think the same was as the technical writer, so by the time our code gets that far down the line no one really knows what it does, unless we commented or included documentation.

It usually takes 10-30 seconds to write documentation for a function, property, class, whatever. It's usually really quick, and it provides a plethora of knowledge afterwards. It really helps create a much more pleasant environment to work in.

If you overdo it (as in the case of that article), I'd suggest finding a better way to write it. You can reorganize it, and for the curious few include a "for more information on <thing>, click here." That simple — you keep it concise for the consuming developers, and don't leave out important information.

So what can you do to improve the situation?

Well I already mentioned writing documentation about whatever "thing" you created, document what the function does or what the property means. But it's more than that — if you see documentation out there that is bad, build a better version. Help improve, create the documentation you needed.

When desseminating documentation, it is our job to write clearly, concisely and effectively. Keep in mind that someone with zero domain knowledge will probably read it, but so will someone with excessive domain knowledge. Write your documentation for everyone.


I know this is more a rant than anything else, but it's a critical topic none-the-less. We need to build documentation clearly and effectively, and I don't see enough good documentation going around. Let's change that.

Getting started with programming and getting absolutely nowhere (Part 17)

How close are two words?

Lesson 16: Increasing the level of de-obsfucation

Today's lesson is going to continue on our "spam filter" adventure, but the next step it to start supporting similar-but-not-quite-identical words.

Anyone who's ever typed or written text has most assuredly spelled a word or two wrong, probably more than once. (In fact, I spell 'appropriate' wrong every day — for some reason I always end up with 'apporpriate', I think it's the speed at which I type.) These may be mistakes, or they may be intentional. Sometimes we spell something wrong for a reason.

Often times "spammers" will spell words wrong on-purpose — instead of just swapping letters out for other, similar letters, they'll actually mis-spell something knowing the filter is searching for the correctly spelled term. Of course, a good spam-filter (which is what we plan to build) can detect some of this and make decisions based on that fact itself.

As of right now we're testing several cases of our filters against the following terms: "nope"; "fail"; "leet", today we're going to add some more cases, such as "noope", and "failing". These are going to be two more words we match against, but we're also now going to return how effectively the filter matched the term. That is, "nope" would match against "nope" for a 100% match, but "noope" would not, so we'll devise a manner of indicating what the success-rate is.

We're also going to try to prevent some false-alarms. A word like "true" doesn't really match any of our filters, but it's fair to say that it shares letters with two of them: it shares one letter with "nope", and two letters with "leet". We know that it's not even close, but we'll have to tell the computer that before we continue.

This can also be the basis of a 'spell-checker'

At this moment we're devising a piece of code that could serve as a trivial spell-checker, we're going to have to design it that way because we have to support mis-spellings, and the easiest way to do so is to design it as a spell-checker.

We're also going to do some TDD here — for those unfamiliar, TDD is simply "test-driven development", or the idea that you build tests that have an input and expected output, and then start writing the code to get there. We would write a test such as:

let expected = 1.0
let inputTerm = "nope"
let inputFilter = ["nope"]
let actual = inputTerm |> matchFilters inputFilter |> Seq.tryHead |> Option.map snd |> (function | None -> 0.0 | Some v -> v)
printfn "%s" (if actual = expected then "Pass" else "Fail")

This uses the AAA pattern, or "Arrange, Act, Assert". That is:

  • Arrange: prepare the conditions for the test, this is the input and output. What are we looking for? What are we given?
  • Act: perform the operation for the test, this maps the input(s) to the output(s). What do we get?
  • Assert: determine whether the expected output and actual output match. What really happened?

When I do TDD I always write a test, then write the function being tested, and repeat. I want to start with the main "framework" that we'll use (though this is a basic testing framework, there are others that are far better than this, I recommend using one of them):

module Tests =
    module Assert =
        let private fn fn msg expected actual =
            if fn expected actual then None
            else (msg expected actual) |> Some
        let equal expected actual = fn (=) (sprintf "expected: %A; actual: %A") expected actual
        let notEqual expected actual = fn (<>) (sprintf "expected not: %A; actual: %A") expected actual
        let largerThan expected actual = fn (>) (sprintf "expected greater than: %A; actual %A") expected actual
        let largerEqual expected actual = fn (>=) (sprintf "expected greater than / equal to: %A; actual %A") expected actual
        let smallerThan expected actual = fn (<) (sprintf "expected smaller than: %A; actual %A") expected actual
        let smallerEqual expected actual = fn (<=) (sprintf "expected smaller than / equal to: %A; actual %A") expected actual
        let print assertion =
            match assertion with
            | None -> printfn "Pass"
            | Some msg -> printfn "Fail: %s" msg

    // Define tests

    let runTests () =
        let tests =
            [ // List tests
              ]
        tests |> List.iter ((|>) () >> Assert.print)

This is obviously not the easiest solution to use, but it works. Drop all your test methods in the tests list, and define them before runTests, and life should be good. As an example, our first test would look like:

let ``A term that exactly matches a word in the filter should return 1.0 as match`` () =
    let expected = 1.0
    let inputTerm = "nope"
    let inputFilter = ["nope"]
    let actual = inputTerm |> matchFilters inputFilter |> Seq.tryHead |> Option.map snd |> (function | None -> 0.0 | Some v -> v)
    Assert.equal expected actual

let runTests =
    let tests = [``A term that exactly matches a word in the filter should return 1.0 as match``]
    tests |> List.iter ((|>) () >> Assert.print)

Pretty easy, right?

Write the matchFilters function

Now we start writing the matchFilters function. We know what our inputs should be: a list of filters and a term to test, and we expect that it returns some sort of sequenced tuple. Specifically, we're going to return every term matched, and the "accuracy" of that match.

Initially, we can obviously define this pretty easily to pass our test:

let matchFilters filters term = filters |> List.choose (fun filter -> if term = filter then (filter, 1.0) |> Some else None)

If we run our tests now we should get Pass printed out. Pretty easy, right? Now we could have actually written let matchFilters filters term = ("", 1.0), and that would have been completely valid and satisfied our tests. It's not wrong. We still made all our tests pass. The version I wrote just happens to satisfy what the next test we'll write is as well.

Next we should probably test that a word like "abcd" doesn't accidentally return 1.0. If we write matchFilters as the version in the previous paragraph, it would do so, so we want a test for it:

let ``A term that does not match any words in the filter should return no matches`` () =
    let expected = 0.0
    let inputTerm = "abcd"
    let inputFilter = ["nope"]
    let actual = inputTerm |> matchFilters inputFilter |> Seq.tryHead |> Option.map snd |> (function | None -> 0.0 | Some v -> v)
    Assert.equal expected actual

Now we are forced to write a version that will at the very least return 1.0 for the first test, and 0.0 for this second one. You should now get the idea of TDD: write a test, build the method to make the test pass, repeat. Ideally, you don't trust a test until you've proven that the current version of the method made it change. That is: the test should fail first, then build a version of the method that passes. Think of it as a red light turning green: you can truly trust the traffic light once you've seen that happen, because you know the basic "is it just stuck on green?" test completed, and told you "nope, it's cycling like normal."

Now, interestingly, our second test isn't actually quite right, but I'm going to leave it as is. We said it should return no matches, but we tested it in a manner that loses that idea. It doesn't matter in the end, because no matches is synonymous with 0% match, but it's a good point to keep in mind.

Before we continue, let's make life a little easier on us. We should remember from previous lessons that our goal is always to write clear, concise, informative code. We can make matchFilters slightly better with that in mind:

let matchFilters filters term =
    let matchFilter filter =
        if term = filter then (filter, 1.0) |> Some
        else None
    filters
    |> Seq.ofList
    |> Seq.choose matchFilter

Now when we're ready to build better criteria, we just have to swap out matchFilter. Much easier than dealing with that silly lambda.

Build our spell-checking algorithm (sort-of)

Alright, on to the hard part. For the first iteration of the algorithm, we're simply going to test what letters are in both terms, then divide that by the total letters from both terms for the percentage. This will return 100% for "nope"/"nope", and 0% for "nope"/"abcd". This will also return a partial match for "nope"/"true" and "nope"/"leet".

Doing this is actually quite simple, we have a few options, the one I'll use is to create a concatenated array of both the strings, and then group them, and divide each group size by 2. As an example, consider we have "nope"/"true", when we concat we'll get [|'n'; 'o'; 'p'; 'e'; 't'; 'r'; 'u'; 'e'|], when we group the characters and get the counts we'll have [|('n', 1); ('o', 1); ('p', 1); ('e', 2); ('t', 1); ('r', 1); ('u', 1)|], if we sum floor(group.Count / 2) we get 1, which we then double and find that 2 of the total 8 characters were in each string, or 25%.

let matchFilters filters (term : string) =
    let matchFilter (filter : string) =
        let termChars = term.ToCharArray()
        let filterChars = filter.ToCharArray()
        let concated = termChars |> Array.append filterChars
        let grouped = concated |> Array.groupBy id |> Array.map (fun (c, a) -> (c, a |> Array.length))
        let sum = grouped |> Array.fold (fun acc (c, i) -> acc + i / 2) 0 |> float |> (*) 2.0
        if sum <= 0.0 then None
        else (filter, sum / (term.Length + filter.Length |> float)) |> Some
    filters
    |> Seq.ofList
    |> Seq.choose matchFilter
    |> Seq.sortByDescending snd

So that's what we end up with, for now. (We'll probably improve this in a later lesson.) We added the Seq.sortByDescending snd to order them by best match first. If we run our two existing tests we should get "Pass" and "Pass". We didn't create tests for this yet (well, I did, you probably haven't) so I recommend that before dumping this in your code you build a couple basic tests.

let testFn filters = matchFilters filters >> Seq.tryHead >> Option.map snd >> (function | None -> 0.0 | Some v -> v)

let ``A term that partially matches a word in the filter should return 0.* as match`` () =
    let expected = 0.25
    let inputTerm = "true"
    let inputFilter = ["nope"]
    let actual = (inputFilter, inputTerm) ||> testFn
    Assert.equal expected actual

let ``A term that mostly matches a word in the filter should return 0.* as match`` () =
    let expected = 0.75
    let inputTerm = "mope"
    let inputFilter = ["nope"]
    let actual = (inputFilter, inputTerm) ||> testFn
    Assert.equal expected actual

This method, of course, assumes that we only care about what characters are present and that the order is irrelevant. We'll (at some point) want to redefine it to support matching terms such that terms that have the same letters but not in the same position are not exactly a 1.0, etc. For now, this will do.

Finally, come up with a "threshold" for acceptance

When we do a "heuristic" match like this, we need to define a threshold of tolerance. We don't want "true" to match the "nope" filter because it only shares an e, we wnat to define a minimum-level-of-acceptability. For this, I usually start with a somewhat middle-ground value, knowing that 0.5 means 50% of our words were a match, I go with 0.75, or 75%. This gives us an initial feeling for where we stand, and we can tune that later.

Now I'm going to redefine things a little, don't fear, just to keep it a little cleaner:

let matchFilters filters (term : string) =
    let matchFilter (filter : string) =
        let termChars = term.ToCharArray()
        let filterChars = filter.ToCharArray()
        let concated = termChars |> Array.append filterChars
        let grouped = concated |> Array.groupBy id |> Array.map (fun (c, a) -> (c, a |> Array.length))
        let sum = grouped |> Array.fold (fun acc (c, i) -> acc + i / 2) 0 |> float |> (*) 2.0
        if sum <= 0.0 then None
        else (filter, sum / (term.Length + filter.Length |> float)) |> Some
    filters
    |> Seq.ofList
    |> Seq.choose matchFilter
let bestFilter filters =
    matchFilters filters >> Seq.sortByDescending snd >> Seq.tryHead >> Option.map snd

Pretty simple, we just moved our Seq.sortByDescending out, and now we'll go ahead and add a function for meetsThreshold which takes our "filter" and threshold:

let meetsThreshold threshold (filter, percent) = percent > threshold

Time to modify our previous matchedFilters

Finally, the fun part. We'll modify the matchedFilters to use our new algorithm. I'm only going to pick the best-matched filter, and we're only going to pick which transformation of the term it was that best-matched the filter, so we'll end up with the following:

let matchedFilters term =
    term
    |> transformToLowerTerm
    |> Array.map (bestFilter filters >> (function | None -> ([||], 0.0) | Some f -> f))
    |> Array.sortByDescending snd
    |> Array.filter (meetsThreshold 0.75)
    |> Array.tryHead
    |> (function | None -> [] | Some a -> [a])

I did make one modification to matchFIlters: the inner-function was defined to use strings, we want to use our int-arrays, so we'll change:

let matchFilters filters (term : string) =
    let matchFilter (filter : string) =
        let termChars = term.ToCharArray()
        let filterChars = filter.ToCharArray()
        let concated = termChars |> Array.append filterChars

To:

let matchFilters filters (term : int[]) =
    let matchFilter (filter : int[]) =
        let concated = term |> Array.append filter

And we should have the new algorithm. We should get some matched terms:

val matchedTerms : (string * string list) list =
  [("ℕope", ["nope"]); ("𝑵ope", ["nope"]); ("ռope", ["nope"]);
   ("nope", ["nope"]); ("𝕱ail", ["fail"]); ("𝓕ail", ["fail"]);
   ("l33t", ["leet"]); ("1337", ["leet"]); ("noope", ["nope"])]

We notice that "failing" is not in there - we can add it by tuning that 0.75 threshold from before. And, finally, if you want to include the percent that it matched the filter in your matchedTerms, replace it with the following:

let matchedTerms =
    terms
    |> List.map (fun t -> (t, t |> matchedFilters))
    |> List.filter (snd >> List.length >> any)
    |> List.map (fun (t, f) -> (t |> String.fromCodePoints, f |> List.map (fun (f, s) -> (f |> String.fromCodePoints, s))))

We just change the fst >> String.fromCodePoints to a lambda, which maps the first item to the string, and the second item as the value still.


Alright, so that sums up todays lesson. I know it got a bit rambly towards the end but that's what happens when you have absolutely no sleep and write this over a 5-day period. I actually have a plan for the next one, which involves more spreadsheet stuff, so tune-in next time for another fabulous adventure in learning how to program (but the hard way). Good luck and enjoy!

Getting started with programming and getting absolutely nowhere (Part 16)

Increasing the level of de-obsfucation

Lesson 15: Building a list of several candidates

Today the lesson will be short, I've been swamped at work but I told myself that at a minimum I wanted a post each week. As a result, we'll talk about a lot of basic things, but in reality we're just going to build on what we did in lesson 15. We took the entire Unicode confusables map in, now we're going to add some of our own.

Let's revisit some history

When the internet started taking shape, and technology became more prevalent, we started coming up with shorter ways to type things. The advent of chatrooms, instant messengers, and text messaging all influenced us to design a form of text that is similar but not the same as normal, standard text. We began finding ways to replace letters with numbers, (such as 0 for o), and finding ways to abbreviate words/phrases/terms to make them easier to type.

One of these many changes to our general text-processing was the advent of something known commonly as "leet speak", often written "l33t." This is the idea of replacing letters with numbers that look similar, often seen in online-gaming communities. For example, we would use a 3 instead of e, a 4 for an a, a 5 for s, a 7 for T. We could even use a 1 for an i or l — the reader is usually capable of distinguishing which was intended. With all that said, we can come up with 1337, which when properly replaced would be our term "leet".

This adds a precarious challenge to our spam-filtering: robots can now replace the letters with a similar number, the reader will still interpret it properly, but now it bypasses the filters (because adv3nt is not the same as advent). We need a way to distinguish when this is happening.

Build on the existing foundation

Well, we already have the foundation for that. We have the confusables map, and we can easily-enough add new items to this map. In fact, we saw an example of it previously:

let confusables2 =
    data
    |> String.split '\n'
    |> Seq.filter (String.startsWith "#" >> not)
    |> Seq.map (Seq.takeWhile ((<>) '#') >> Seq.toArray >> String.implode >> String.split ';')
    |> Seq.choose mapToConfusable
    |> Seq.toArray

let confusables =
    [|{ Original = ("ℕ", 0) ||> Char.toCodePoint; Replacement = "m" |> String.toCodePoints |> Seq.toArray; Field3 = "" }|]
    |> Array.append confusables2

Now, confusables2 is not a good name, and we really should make adding new ones a bit easier, so let's define a new obsfucationConfusables that has our new confusables:

let obsfucationConfusables =
    let itemToConfusable (orig, repl) =
        { Original = (orig, 0) ||> Char.toCodePoint
          Replacement = repl |> String.toCodePoints |> Seq.toArray
          Field3 = "OBS" }
    [|("1", "i"); ("1", "l")
      ("2", "z"); ("2", "s")
      ("3", "e")
      ("4", "a")
      ("5", "s"); ("5", "z")
      ("6", "g"); ("6", "b")
      ("7", "t")
      ("8", "b")
      ("9", "g")
      ("0", "o")
      ("\\", "i"); ("\\", "l")
      ("/", "i"); ("/", "l")
      ("|", "i"); ("|", "l")
      ("!", "i"); ("!", "l")
      ("+", "t")
      ("@", "a")
      ("$", "s")
      ("&", "b")
      ("(", "c")
      ("[", "c")|]
    |> Array.map itemToConfusable

This was actually really easy, we could even load this from another text-file if we'd like. Because we built with foresight, we don't have to worry too much about how to inject these new values. We can map them directly to our previous type, and build out our confusables as a simple array append (as done before):

let confusables =
    obsfucationConfusables
    |> Array.append unicodeConfusables

We're appending obsfucationConfusables after our unicodeConfusables, so we'll be adding them to the replacements. This means that adding a new test is easy:

let filters = ["nope"; "fail"; "leet"] |> listToCodePoints
let terms = ["ℕope"; "𝑵ope"; "ռope"; "nope"; "𝕱ail"; "𝓕ail"; "pass"; "𝕿rue"; "𝓽𝓻𝓾𝒆"; "l33t"; "1337"] |> listToCodePoints

Done. That's all we need to do, now l33t and 1337 will be matched to leet. If we map out all the combinations of terms, F# will happily give us the following:

possibilitiesForL33t = [|"l33t"; "le3t"; "l3et"; "leet"|]
possibilitiesFor1337 = [|"1337"; "l337"; "i337"; "l337"; "1e37"; "le37"; "ie37"; "le37"; "13e7"; "l3e7"; "i3e7"; "l3e7"; "1ee7"; "lee7"; "iee7"; "lee7"; "133t"; "l33t"; "i33t"; "l33t"; "1e3t"; "le3t"; "ie3t"; "le3t"; "13et"; "l3et"; "i3et"; "l3et"; "1eet"; "leet"; "ieet"; "leet"|]]

Right, so we've established that this worked properly before, now it works still but we added a whole new featur and capability. The next step will be to take an input string, and process each word in it.


I know this was short and boring, but it's about the best I could do this week. I've been seriously swamped at work, it's been eating into my personal time just a bit. With that said, this should get us moving towards the path to success, and we'll get to some really fun data-analysis as we progress further. I'm going in to 'hardware mode' this weekend, so I should be right-properly refreshed by Monday.

Getting started with programming and getting absolutely nowhere (Part 15)

Building a list of several candidates

Lesson 14: Starting our next problem

In the previous lesson, I mentioned the following:

One of the biggest issues, that might not be immediately apparent, with how we do this is: what happens if we have multiple confusables for a character? We only transform the first one, which means we could be missing out on some possible conversions. We'll go over fixing this in the next lesson, but I want you to start thinking about it now. Essentially, our goal will be to swap out the transformTerm with a version that returns every possible option (which for each of our samples is still only 1).

If it's not clear, what I mean by this is that if a character we're given could be confused with two different characters, then we want to consider both possibilities. We want to be able to test all variants, to ensure we don't miss any.

For this, we need a way to gather all the variants. Here we'll go back to a quick REPL integration, to begin our development. We want a function, called getCombinations, that allows us to provide an array of arrays, and returns another array of arrays of each possible combination of those positions. So, if we provided [|[|"g"|]; [|"r"|]; [|"e"; "a"|]; [|"y"|]|], we want to get back [|[|"g"; "r"; "e"; "y"|]; [|"g"; "r"; "a"; "y"|]|], thus, we should have the following:

// This should be [|"grey"; "gray"|]
let combinationTest = [|[|'g'|]; [|'r'|]; [|'e'; 'a'|]; [|'y'|]|] |> getCombinations |> Array.map String.implode

Make sense? So given an array of options for a position, return all possible combinations of those options. If we provide [|[|'w'|]; [|'o'|]; [|'r'|]; [|'d'|]|], get [|[|'w'; 'o'; 'r'; 'd'|]|].

We're going to do this in a fairly simple manner, using recursion (but not tail-call recursion, in our example). Now I've done the hard part, and written the getCombinations function, in this lesson we'll focus on modifying our current program to use it.

Getting all the combinations

This function is somewhat simple, and lacks tail-call recursion, so a good exercise for you might be to rewrite it for tail-call recursion. (It can be done, and I have done it with this just to prove it, we'll discuss that in a different non-lesson.)

let rec getCombinations<'a> (input : 'a array array) : 'a array array =
    let currentArray = input |> Array.head
    let tail = input |> Array.tail
    if tail |> Array.length |> any then
        tail
        |> getCombinations
        |> Array.map (fun ia ->
            currentArray
            |> Array.map (fun ca ->
                [|[|ca|]; ia|]
                |> Array.flatten))
        |> Array.flatten
    else currentArray |> Array.map Array.initOne

So, we basically get the sub-combinations, and combine them with the combinations of the option we currently have. Pretty simple, right?

Now we can test this with the previous tests, and prove it to ourselves, but I leave that up to you. (I already proved it to myself, it's your turn to prove that it works like we want.) We use the 'generic type' specification here, as it should really work on anything. So, your signature should be 'a array array -> 'a array array.

Next, we need to pull this to our transform

Previously, we defined transformTerm as follows:

let transformTerm =
    Array.map (fun codePoint ->
        match codePoint |> findCandidates with
        | [||] -> [|codePoint|]
        | candidates -> candidates.[0].Replacement)
    >> Array.flatten

We're going to slightly update this defnition. The first step is to modify candidates to return all the replacements: candidates |> Array.map (fun x -> x.Replacement)). Next, we need to modify the [||] (empty) match-case to return [|[|codePoint|]|]. Then, we need to add a call to getCombinations at the end. Finally, move Array.flatten to the end, in an Array.map.

let transformTerm =
    Array.map (fun codePoint ->
        match codePoint |> findCandidates with
        | [||] -> [|[|codePoint|]|]
        | candidates -> candidates |> Array.map (fun x -> x.Replacement))
    >> getCombinations
    >> Array.map Array.flatten

This, of course, breaks our transformToLowerTerm and matchedFilters. For transformToLowerTerm we just need to Array.map lowerCaseTerm instead, and the matchedFilters will be a bit more difficult. Our previous version was pretty simple:

let matchedFilters term = filters |> List.filter (term |> transformToLowerTerm |> (=))

Here, we have to think about what needs done now. We have multiple terms being matched to multiple filters, as such we have a significant rewrite to do, but only on the List.filter portion. We'll modify the filter to filter each term against the current filter, and then test that we got any matches. If we did, any form of the term matched our filter.

let matchedFilters term =
    filters
    |> List.filter (fun filter ->
        let transformations = term |> transformToLowerTerm
        let anyMatch = transformations |> Array.filter ((=) filter)
        anyMatch |> Array.length |> any)

That should be all we need to modify, we can test our transformToLowerTerm with the following:

terms |> List.map (transformToLowerTerm >> Array.map String.fromCodePoints)

Small bugs begin to appear

We do happen to have a bug in this, and I'll show you what it is quick.

In the case that we add a confusable, we always ignore the original character. This can be problematic if we add confusables for the regular ASCII characters, such as o, p, or e. To fix this, we simply add Array.append [|[|codePoint|]|] after our Array.map (fun x -> x.Replacement). This will include the original character as well.

We can test all this out on our real data-set, by changing our confusables and filters:

let confusables2 =
    data
    |> String.split '\n'
    |> Seq.filter (String.startsWith "#" >> not)
    |> Seq.map (Seq.takeWhile ((<>) '#') >> Seq.toArray >> String.implode >> String.split ';')
    |> Seq.choose mapToConfusable
    |> Seq.toArray

let confusables =
    [|{ Original = ("ℕ", 0) ||> Char.toCodePoint; Replacement = "m" |> String.toCodePoints |> Seq.toArray; Field3 = "" }|]
    |> Array.append confusables2

let listToCodePoints = List.map (String.toCodePoints >> Seq.toArray)

let filters = ["mope"; "fail"] |> listToCodePoints
let terms = ["ℕope"; "𝑵ope"; "ռope"; "nope"; "𝕱ail"; "𝓕ail"; "pass"; "𝕿rue"; "𝓽𝓻𝓾𝒆"] |> listToCodePoints

This should result in matchedTerms being val matchedTerms : string list = ["ℕope"; "𝕱ail"; "𝓕ail"] instead of the whole list, if we change mope to nope we still get the first "ℕope", which proves everything works properly.

Great work, team!

Overall, we should have learned that if we structure our code well, we can add new functionality without having to do too much modification. We can easily add a whole new feature without creating much of a problem at all.


The entire goal of this lesson is to show how we can add substantial functionality to our software without doing major rewrites. It also helps demonstrate how we should be thinking about problems as we solve them. Work from a small issue up to the larger one, and build independent stages to allow easy troubleshooting, testing and integration of them. This wasn't a long lesson, but it's just as important as any other. (I've been swamped at work all week thanks to the ball being dropped by an API we consume, so haven't had a lot of time to get a good post together. Hopefully this meets everyone's expectations.)