diff --git a/src/ClassicalCiphers.jl b/src/ClassicalCiphers.jl
index 624be99..d01860d 100644
--- a/src/ClassicalCiphers.jl
+++ b/src/ClassicalCiphers.jl
@@ -8,7 +8,7 @@ include("caesar.jl")
 include("vigenere.jl")
 include("solitaire.jl")
 
-export encrypt_monoalphabetic, decrypt_monoalphabetic,
+export encrypt_monoalphabetic, decrypt_monoalphabetic, crack_monoalphabetic,
        encrypt_caesar, decrypt_caesar,
        encrypt_vigenere, decrypt_vigenere,
        encrypt_solitaire, decrypt_solitaire,
diff --git a/src/common.jl b/src/common.jl
index b086dba..de5c0c8 100644
--- a/src/common.jl
+++ b/src/common.jl
@@ -64,13 +64,35 @@ Performs a trigram analysis on the input string, to determine how close it
 is to English. That is, splits the input string into groups of three letters,
 and assigns a score based on the frequency of the trigrams in true English.
 """
-function string_fitness(input)
-  str = uppercase(letters_only(input))
+function string_fitness(input; alreadystripped=false)
+  # alreadystripped=true means the caller has already removed non-letters,
+  # so the letters_only pass is skipped.
+  str = uppercase(alreadystripped ? input : letters_only(input))
+
+  # An empty string would give log(0/0) = NaN, which poisons comparisons.
+  if isempty(str)
+    return -Inf
+  end
 
   ans = 0
   for i in 1:(length(str)-2)
     ans += get(trigram_fitnesses, str[i:i+2], 0)
   end
 
-ans
+  # Log of the total trigram score divided by the number of letters;
+  # this is -Inf when the total is zero.
+  log(ans/length(str))
 end
+
+"""
+Finds the frequencies of all characters in the input string, returning a Dict
+of 'a' => 4, for instance. Uppercase characters are considered distinct from lowercase.
+"""
+function frequencies(input)
+  ans = Dict{Char, Int}()
+  for i in input
+    # get defaults to 0, so the first occurrence of a character counts as 1.
+    ans[i] = get(ans, i, 0) + 1
+  end
+  ans
+end
diff --git a/src/monoalphabetic.jl b/src/monoalphabetic.jl
index 32e6880..1af80ce 100644
--- a/src/monoalphabetic.jl
+++ b/src/monoalphabetic.jl
@@ -47,3 +47,141 @@ function decrypt_monoalphabetic(ciphertext, key::AbstractString)
   dict = [(a => Char(96 + search(lowercase(key), a))) for a in lowercase(key)]
   encrypt_monoalphabetic(lowercase(ciphertext), dict)
 end
+
+# Cracking
+
+# The method we use for cracking is simulated annealing.
+
+"""
+swap_two(string) swaps two of the characters of the input string, at random.
+The characters are guaranteed to be at different positions, though "aa" would be
+'swapped' to "aa".
+Throws an ArgumentError if the string has fewer than two characters, since
+there are then no two distinct positions to swap.
+"""
+function swap_two(str)
+  chars = collect(str)
+  n = length(chars)
+  if n < 2
+    throw(ArgumentError("swap_two needs a string of at least two characters"))
+  end
+
+  # Redraw until the two positions differ.
+  first_index, second_index = rand(1:n, 2)
+  while first_index == second_index
+    first_index, second_index = rand(1:n, 2)
+  end
+
+  chars[first_index], chars[second_index] = chars[second_index], chars[first_index]
+  join(chars, "")
+end
+
+"""
+crack_monoalphabetic cracks the given ciphertext which was encrypted by the monoalphabetic
+substitution cipher.
+Returns a tuple of the best key found and the ciphertext decrypted with that key.
+Possible arguments include:
+starting_key="", which when specified (for example, as "ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
+  starts the simulation at the given key. The default causes it to start with the most
+  common characters being decrypted to the most common English characters.
+min_temp=0.0001, which is the temperature at which we stop the simulation.
+temp_factor=0.97, which is the factor by which the temperature decreases each step.
+chatty=0, which can be set to 1 to print whenever the key is updated, or 2 to print
+  whenever any new key is considered.
+rounds=1, which sets the number of repetitions we perform. Each round starts with the
+  best key we've found so far.
+acceptance_prob=((e,ep,t) -> ep > e ? 1. : exp(-(e-ep)/t)), which is the probability
+  with which we accept new key of fitness ep, given that the current key has fitness e,
+  at temperature t.
+"""
+function crack_monoalphabetic(ciphertext; starting_key="",
+                              min_temp=0.0001, temp_factor=0.97,
+                              acceptance_prob=((e,ep,t) -> ep > e ? 1. : exp(-(e-ep)/t)),
+                              chatty=0,
+                              rounds=1)
+
+  if starting_key == ""
+    # most common letters
+    commonest = "ETAOINSHRDLUMCYWFGBPVKZJXQ"
+    freqs = frequencies(uppercase(letters_only(ciphertext)))
+    for c in 'A':'Z'
+      if !haskey(freqs, c)
+        freqs[c] = 0
+      end
+    end
+
+    # Pair the i-th most frequent ciphertext letter with the i-th commonest
+    # English letter, storing it at that English letter's alphabet position.
+    freqs_input = sort(collect(freqs), by = tuple -> last(tuple), rev=true)
+    start_key = ['a' for c in 1:26]
+    for i in 1:26
+      start_key[Int(commonest[i])-64] = freqs_input[i][1]
+    end
+
+    key = join(start_key, "")
+  else
+    key = starting_key
+  end
+
+  if chatty > 1
+    println("Starting key: $(key)")
+  end
+
+  stripped_ciphertext = letters_only(ciphertext)
+  fitness = string_fitness(decrypt_monoalphabetic(stripped_ciphertext, key), alreadystripped=true)
+  total_best_fitness = fitness
+  total_best_key = key
+  total_best_decrypt = decrypt_monoalphabetic(ciphertext, key)
+
+  for roundcount in 1:rounds
+    # Each round starts at temperature 10^((roundcount-1)/rounds).
+    temp = 10^((roundcount-1)/rounds)
+    while temp > min_temp
+      # Try ceil(1/temp) neighbouring keys at this temperature, capped at 10.
+      for i in 1:round(Int, min(ceil(1/temp), 10))
+        neighbour = swap_two(key)
+        new_fitness = string_fitness(decrypt_monoalphabetic(stripped_ciphertext, neighbour), alreadystripped=true)
+        if new_fitness > total_best_fitness
+          total_best_fitness = new_fitness
+          total_best_key = neighbour
+          total_best_decrypt = decrypt_monoalphabetic(ciphertext, total_best_key)
+        end
+
+        threshold = rand()
+        # Evaluate once so the printed probability is the one actually used.
+        prob = acceptance_prob(fitness, new_fitness, temp)
+
+        if chatty >= 2
+          println("Current fitness: $(fitness)")
+          println("New fitness: $(new_fitness)")
+          println("Acceptance probability: $(prob)")
+          println("Threshold: $(threshold)")
+        end
+
+        if prob >= threshold
+          if chatty >= 1
+            println("$(key) -> $(neighbour), threshold $(threshold), temperature $(temp), fitness $(new_fitness), prob $(prob)")
+          end
+          fitness = new_fitness
+          key = neighbour
+        end
+      end
+
+      temp = temp * temp_factor
+
+      if chatty >= 2
+        println("----")
+      end
+    end
+
+    # Restart the next round from the best key seen so far.
+    key = total_best_key
+    fitness = total_best_fitness
+  end
+
+  if chatty >= 1
+    println("Best was $(total_best_key) at $(total_best_fitness)")
+    println(total_best_decrypt)
+  end
+  (total_best_key, total_best_decrypt)
+end