Compare commits: 1b738b200a ... f873e5ca3d

19 commits:

f873e5ca3d
bdb5d8e192
fd55cd1c5f
095a8af7f2
5bb1bddf83
242f71fa75
6ab19d4c4d
1ee76d4bc3
fac93253f2
a0da79591a
deb0ec67ca
e42cfa22db
87f191e479
379bd1554a
bbbacd421b
6dbd89aaac
64d98757f4
ae6430aa85
41977a726e
.github/workflows/rust.yml (vendored): 20 changed lines

@@ -21,7 +21,7 @@ jobs:
 {
   "name": "Install Nix",
   "uses": "cachix/install-nix-action@v17",
-  "with": { "extra-nix-config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
+  "with": { "extra_nix_config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
 },
 {
   "name": "cargo test",
@@ -41,7 +41,7 @@ jobs:
 {
   "name": "Install Nix",
   "uses": "cachix/install-nix-action@v17",
-  "with": { "extra-nix-config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
+  "with": { "extra_nix_config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
 },
 {
   "name": "cargo test (release)",
@@ -62,7 +62,7 @@ jobs:
 # {
 #   "name": "Install Nix",
 #   "uses": "cachix/install-nix-action@v17",
-#   "with": { "extra-nix-config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
+#   "with": { "extra_nix_config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
 # },
 # {
 #   "name": "Run Shellcheck",
@@ -83,7 +83,7 @@ jobs:
 {
   "name": "Install Nix",
   "uses": "cachix/install-nix-action@v17",
-  "with": { "extra-nix-config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
+  "with": { "extra_nix_config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
 },
 {
   "name": "Run Alejandra",
@@ -104,7 +104,7 @@ jobs:
 {
   "name": "Install Nix",
   "uses": "cachix/install-nix-action@v17",
-  "with": { "extra-nix-config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
+  "with": { "extra_nix_config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
 },
 {
   "name": "Run ShellCheck",
@@ -125,11 +125,11 @@ jobs:
 {
   "name": "Install Nix",
   "uses": "cachix/install-nix-action@v17",
-  "with": { "extra-nix-config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
+  "with": { "extra_nix_config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
 },
 {
   "name": "Run Clippy",
-  "run": "nix develop --command cargo -- clippy -- -D warnings"
+  "run": "nix develop --command cargo -- clippy -- -D warnings -W clippy::must_use_candidate"
 }
 ]
 }
@@ -146,7 +146,7 @@ jobs:
 {
   "name": "Install Nix",
   "uses": "cachix/install-nix-action@v17",
-  "with": { "extra-nix-config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
+  "with": { "extra_nix_config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
 },
 {
   "name": "Build app",
@@ -167,7 +167,7 @@ jobs:
 {
   "name": "Install Nix",
   "uses": "cachix/install-nix-action@v17",
-  "with": { "extra-nix-config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
+  "with": { "extra_nix_config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
 },
 {
   "name": "Run app",
@@ -188,7 +188,7 @@ jobs:
 {
   "name": "Install Nix",
   "uses": "cachix/install-nix-action@v17",
-  "with": { "extra-nix-config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
+  "with": { "extra_nix_config": "access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}" }
 },
 {
   "name": "Run link checker",
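Note: the same one-line fix is applied to every job in the workflow. The cachix/install-nix-action input is named extra_nix_config (with underscores), so the hyphenated extra-nix-config key was not being recognised as an input and the access token was never passed through to Nix. The Clippy job additionally opts in to the clippy::must_use_candidate lint alongside -D warnings.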
.gitignore (vendored): 1 changed line

@@ -3,3 +3,4 @@ target/
 *.iml
 .vscode/
 .profile*
+.DS_Store
Cargo.lock (generated): 113 changed lines

@@ -26,12 +26,50 @@ dependencies = [
 "wyz",
]

[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

[[package]]
name = "csv"
version = "1.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086"
dependencies = [
 "csv-core",
 "itoa",
 "ryu",
 "serde",
]

[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
 "memchr",
]

[[package]]
name = "funty"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"

[[package]]
name = "getrandom"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
dependencies = [
 "cfg-if",
 "libc",
 "wasi",
]

[[package]]
name = "immutable-chunkmap"
version = "1.0.5"
@@ -43,23 +81,44 @@ dependencies = [
 "packed_struct_codegen",
]

[[package]]
name = "itoa"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"

[[package]]
name = "libc"
version = "0.2.142"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317"

[[package]]
name = "little_learner"
version = "0.1.0"
dependencies = [
 "immutable-chunkmap",
 "ordered-float",
 "rand",
]

[[package]]
name = "little_learner_app"
version = "0.1.0"
dependencies = [
 "csv",
 "immutable-chunkmap",
 "little_learner",
 "ordered-float",
 "rand",
]

[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"

[[package]]
name = "num-traits"
version = "0.2.15"
@@ -99,6 +158,12 @@ dependencies = [
 "syn",
]

[[package]]
name = "ppv-lite86"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"

[[package]]
name = "proc-macro2"
version = "1.0.56"
@@ -123,6 +188,48 @@ version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"

[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
 "libc",
 "rand_chacha",
 "rand_core",
]

[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
 "ppv-lite86",
 "rand_core",
]

[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
 "getrandom",
]

[[package]]
name = "ryu"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"

[[package]]
name = "serde"
version = "1.0.164"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d"

[[package]]
name = "syn"
version = "1.0.109"
@@ -146,6 +253,12 @@ version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"

[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

[[package]]
name = "wyz"
version = "0.5.1"
@@ -1,3 +1,5 @@
 # The Little Learner, in Rust

+[![Rust](https://github.com/Smaug123/little_learner/actions/workflows/rust.yml/badge.svg)](https://github.com/Smaug123/little_learner/actions/workflows/rust.yml)
+
 Me running through [The Little Learner](https://www.thelittlelearner.com/), but in Rust instead of Scheme.
@@ -8,5 +8,6 @@ edition = "2021"
 [dependencies]
 immutable-chunkmap = "1.0.5"
 ordered-float = "3.6.0"
+rand = "0.8.5"

 [lib]
File diff suppressed because it is too large.
little_learner/src/block.rs (new file)

@@ -0,0 +1,63 @@
use crate::auto_diff::{Differentiable, RankedDifferentiable, RankedDifferentiableTagged};
use crate::ext::relu;
use crate::traits::NumLike;

pub struct Block<F, const N: usize> {
    f: F,
    ranks: [usize; N],
}

/// Runs the first argument first: `compose(b1, b2, j)` applies `b1` to its input, then
/// applies `b2` to the result, with `b2` reading its parameters from `theta[j..]`.
pub fn compose<'a, 'c, 'd, A, T, B, C, F, G, const N: usize, const M: usize>(
    b1: Block<F, N>,
    b2: Block<G, M>,
    j: usize,
) -> Block<impl FnOnce(&'a A, &'d [T]) -> C, { N + M }>
where
    F: FnOnce(&'a A, &'d [T]) -> B,
    G: for<'b> FnOnce(&'b B, &'d [T]) -> C,
    A: 'a,
    T: 'd,
{
    let mut ranks = [0usize; N + M];
    ranks[..N].copy_from_slice(&b1.ranks);
    ranks[N..(M + N)].copy_from_slice(&b2.ranks);
    Block {
        f: move |t, theta| {
            let intermediate = (b1.f)(t, theta);
            (b2.f)(&intermediate, &theta[j..])
        },
        ranks,
    }
}

#[must_use]
pub fn dense<'b, A, Tag>(
    input_len: usize,
    neuron_count: usize,
) -> Block<
    impl for<'a> FnOnce(
        &'a RankedDifferentiableTagged<A, Tag, 1>,
        &'b [Differentiable<A>],
    ) -> RankedDifferentiable<A, 1>,
    2,
>
where
    Tag: Clone,
    A: NumLike + PartialOrd + Default,
{
    Block {
        f: for<'a> |t: &'a RankedDifferentiableTagged<A, Tag, 1>,
                    theta: &'b [Differentiable<A>]|
                    -> RankedDifferentiable<A, 1> {
            relu(
                t,
                &(theta[0].clone().attach_rank().unwrap()),
                &(theta[1].clone().attach_rank().unwrap()),
            )
            .attach_rank()
            .unwrap()
        },
        ranks: [input_len, neuron_count],
    }
}
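A hypothetical usage sketch (not part of this diff) of wiring two `dense` blocks together with `compose`. The layer sizes are illustrative assumptions; `j = 2` because the first block consumes two parameter tensors (its weights and its biases), so the second block reads its parameters starting at `theta[2]`:

    use ordered_float::NotNan;

    fn two_layer_network() {
        // 3 inputs feeding 4 neurons, then 4 inputs feeding 2 neurons.
        let layer1 = dense::<NotNan<f64>, ()>(3, 4);
        let layer2 = dense::<NotNan<f64>, ()>(4, 2);
        // The composite's `ranks` array is the concatenation [3, 4, 4, 2];
        // evaluation runs layer1 on the input, then layer2 on its output.
        let network = compose(layer1, layer2, 2);
        let _ = network;
    }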
@@ -1,13 +0,0 @@
use std::marker::PhantomData;

pub struct ConstTeq<const A: usize, const B: usize> {
    phantom_a: PhantomData<[(); A]>,
    phantom_b: PhantomData<[(); B]>,
}

pub fn make<const A: usize>() -> ConstTeq<A, A> {
    ConstTeq {
        phantom_a: Default::default(),
        phantom_b: Default::default(),
    }
}
little_learner/src/decider.rs (new file)

@@ -0,0 +1,68 @@
use crate::auto_diff::RankedDifferentiableTagged;
use crate::loss::dot;
use crate::scalar::Scalar;
use crate::traits::{NumLike, Zero};

pub(crate) fn rectify<A>(x: A) -> A
where
    A: Zero + PartialOrd,
{
    if x < A::zero() {
        A::zero()
    } else {
        x
    }
}

fn linear<A, Tag1, Tag2>(
    t: &RankedDifferentiableTagged<A, Tag1, 1>,
    theta0: &RankedDifferentiableTagged<A, Tag2, 1>,
    theta1: Scalar<A>,
) -> Scalar<A>
where
    A: NumLike,
{
    dot(theta0, t) + theta1
}

pub fn relu<A, Tag1, Tag2>(
    t: &RankedDifferentiableTagged<A, Tag1, 1>,
    theta0: &RankedDifferentiableTagged<A, Tag2, 1>,
    theta1: Scalar<A>,
) -> Scalar<A>
where
    A: NumLike + PartialOrd,
{
    rectify(linear(t, theta0, theta1))
}

#[cfg(test)]
mod test_decider {
    use crate::auto_diff::RankedDifferentiable;
    use crate::decider::{linear, relu};
    use crate::not_nan::to_not_nan_1;
    use crate::scalar::Scalar;
    use ordered_float::NotNan;

    #[test]
    fn test_linear() {
        let theta0 = RankedDifferentiable::of_slice(&to_not_nan_1([7.1, 4.3, -6.4]));
        let theta1 = Scalar::make(NotNan::new(0.6).expect("not nan"));
        let t = RankedDifferentiable::of_slice(&to_not_nan_1([2.0, 1.0, 3.0]));

        let result = linear(&t, &theta0, theta1).real_part().into_inner();

        assert!((result + 0.1).abs() < 0.000_000_01);
    }

    #[test]
    fn test_relu() {
        let theta0 = RankedDifferentiable::of_slice(&to_not_nan_1([7.1, 4.3, -6.4]));
        let theta1 = Scalar::make(NotNan::new(0.6).expect("not nan"));
        let t = RankedDifferentiable::of_slice(&to_not_nan_1([2.0, 1.0, 3.0]));

        let result = relu(&t, &theta0, theta1).real_part().into_inner();

        assert_eq!(result, 0.0);
    }
}
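To see where the expected values in these tests come from: `linear` is a dot product plus bias, so on the test data it computes 7.1 * 2.0 + 4.3 * 1.0 + (-6.4) * 3.0 + 0.6 = 14.2 + 4.3 - 19.2 + 0.6 = -0.1, which is why `test_linear` asserts the result is within 1e-8 of -0.1. `relu` then clamps that negative value to zero via `rectify`, giving the 0.0 asserted in `test_relu`.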
@@ -1,155 +0,0 @@
use immutable_chunkmap::map;
use std::ops::{Add, Mul};

/*
An untyped syntax tree for an expression whose constants are all of type `A`.
*/
#[derive(Clone, Debug)]
pub enum Expr<A> {
    Const(A),
    Sum(Box<Expr<A>>, Box<Expr<A>>),
    Variable(u32),
    // The first `Expr` here is a function, which may reference the input variable `Variable(i)`.
    // For example, `(fun x y -> x + y) 3 4` is expressed as:
    // Apply(0, Apply(1, Sum(Variable(0), Variable(1)), Const(4)), Const(3))
    Apply(u32, Box<Expr<A>>, Box<Expr<A>>),
    Mul(Box<Expr<A>>, Box<Expr<A>>),
}

impl<A> Expr<A> {
    fn eval_inner<const SIZE: usize>(e: &Expr<A>, ctx: &map::Map<u32, A, SIZE>) -> A
    where
        A: Clone + Add<Output = A> + Mul<Output = A>,
    {
        match &e {
            Expr::Const(x) => x.clone(),
            Expr::Sum(x, y) => Expr::eval_inner(x, ctx) + Expr::eval_inner(y, ctx),
            Expr::Variable(id) => ctx
                .get(id)
                .unwrap_or_else(|| panic!("No binding found for free variable {}", id))
                .clone(),
            Expr::Apply(variable, func, arg) => {
                let arg = Expr::eval_inner(arg, ctx);
                let (updated_context, _) = ctx.insert(*variable, arg);
                Expr::eval_inner(func, &updated_context)
            }
            Expr::Mul(x, y) => Expr::eval_inner(x, ctx) * Expr::eval_inner(y, ctx),
        }
    }

    pub fn eval<const MAX_VAR_NUM: usize>(e: &Expr<A>) -> A
    where
        A: Clone + Add<Output = A> + Mul<Output = A>,
    {
        Expr::eval_inner(e, &map::Map::<u32, A, MAX_VAR_NUM>::new())
    }

    pub fn apply(var: u32, f: Expr<A>, arg: Expr<A>) -> Expr<A> {
        Expr::Apply(var, Box::new(f), Box::new(arg))
    }

    pub fn differentiate(one: &A, zero: &A, var: u32, f: &Expr<A>) -> Expr<A>
    where
        A: Clone,
    {
        match f {
            Expr::Const(_) => Expr::Const(zero.clone()),
            Expr::Sum(x, y) => {
                Expr::differentiate(one, zero, var, x) + Expr::differentiate(one, zero, var, y)
            }
            Expr::Variable(i) => {
                if *i == var {
                    Expr::Const(one.clone())
                } else {
                    Expr::Const(zero.clone())
                }
            }
            Expr::Mul(x, y) => {
                Expr::Mul(
                    Box::new(Expr::differentiate(one, zero, var, x.as_ref())),
                    (*y).clone(),
                ) + Expr::Mul(
                    Box::new(Expr::differentiate(one, zero, var, y.as_ref())),
                    (*x).clone(),
                )
            }
            Expr::Apply(new_var, func, expr) => {
                if *new_var == var {
                    panic!(
                        "cannot differentiate with respect to variable {} that's been assigned",
                        var
                    )
                }
                let expr_deriv = Expr::differentiate(one, zero, var, expr);
                Expr::mul(
                    expr_deriv,
                    Expr::Apply(
                        *new_var,
                        Box::new(Expr::differentiate(one, zero, *new_var, func)),
                        (*expr).clone(),
                    ),
                )
            }
        }
    }
}

impl<A> Add for Expr<A> {
    type Output = Expr<A>;
    fn add(self: Expr<A>, y: Expr<A>) -> Expr<A> {
        Expr::Sum(Box::new(self), Box::new(y))
    }
}

impl<A> Mul for Expr<A> {
    type Output = Expr<A>;
    fn mul(self: Expr<A>, y: Expr<A>) -> Expr<A> {
        Expr::Mul(Box::new(self), Box::new(y))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_expr() {
        let expr = Expr::apply(
            0,
            Expr::apply(1, Expr::Variable(0) + Expr::Variable(1), Expr::Const(4)),
            Expr::Const(3),
        );

        assert_eq!(Expr::eval::<2>(&expr), 7);
    }

    #[test]
    fn test_derivative() {
        let add_four = Expr::Variable(0) + Expr::Const(4);
        let mul_five = Expr::Variable(1) * Expr::Const(5);

        {
            let mul_five_then_add_four = Expr::apply(0, add_four.clone(), mul_five.clone());
            let mul_then_add_diff = Expr::differentiate(&1, &0, 1, &mul_five_then_add_four);
            for i in 3..10 {
                // (5x + 4) differentiates to 5
                assert_eq!(
                    Expr::eval::<2>(&Expr::apply(1, mul_then_add_diff.clone(), Expr::Const(i))),
                    5
                );
            }
        }

        {
            let add_four_then_mul_five = Expr::apply(1, mul_five.clone(), add_four.clone());
            let add_then_mul_diff = Expr::differentiate(&1, &0, 0, &add_four_then_mul_five);
            for i in 3..10 {
                // ((x + 4) * 5) differentiates to 5
                assert_eq!(
                    Expr::eval::<2>(&Expr::apply(0, add_then_mul_diff.clone(), Expr::Const(i))),
                    5
                );
            }
        }
    }
}
little_learner/src/ext.rs (new file)

@@ -0,0 +1,442 @@
use crate::auto_diff::{
    Differentiable, DifferentiableTagged, RankedDifferentiable, RankedDifferentiableTagged,
};
use crate::decider::rectify;
use crate::scalar::Scalar;
use crate::traits::{NumLike, Zero};
use std::iter::Sum;
use std::ops::{Add, Mul};

pub fn ext1<A, B, Tag, Tag2, F>(
    n: usize,
    f: &mut F,
    t: &DifferentiableTagged<A, Tag>,
) -> DifferentiableTagged<B, Tag2>
where
    F: FnMut(&DifferentiableTagged<A, Tag>) -> DifferentiableTagged<B, Tag2>,
{
    if t.rank() == n {
        f(t)
    } else {
        t.map_once_tagged(|x| ext1(n, f, x))
    }
}

pub fn ext2<A, B, C, Tag, Tag2, Tag3, F>(
    n: usize,
    m: usize,
    f: &mut F,
    t: &DifferentiableTagged<A, Tag>,
    u: &DifferentiableTagged<B, Tag2>,
) -> DifferentiableTagged<C, Tag3>
where
    F: FnMut(
        &DifferentiableTagged<A, Tag>,
        &DifferentiableTagged<B, Tag2>,
    ) -> DifferentiableTagged<C, Tag3>,
    A: Clone,
    Tag: Clone,
    B: Clone,
    Tag2: Clone,
{
    if t.rank() == n && u.rank() == m {
        f(t, u)
    } else if t.rank() == n {
        u.map_once_tagged(|eu| ext2(n, m, f, t, eu))
    } else if u.rank() == m {
        t.map_once_tagged(|et| ext2(n, m, f, et, u))
    } else if t.rank() == u.rank() {
        t.map2_once_tagged(u, |t, u| ext2(n, m, f, t, u))
    } else if t.rank() > u.rank() {
        t.map_once_tagged(|et| ext2(n, m, f, et, u))
    } else {
        u.map_once_tagged(|eu| ext2(n, m, f, t, eu))
    }
}

pub fn elementwise_mul_via_ext<A, Tag, Tag2, const RANK1: usize, const RANK2: usize>(
    x: &RankedDifferentiableTagged<A, Tag, RANK1>,
    y: &RankedDifferentiableTagged<A, Tag2, RANK2>,
) -> RankedDifferentiable<A, RANK1>
where
    A: Mul<Output = A> + Sum<<A as Mul>::Output> + Clone + Default,
    Tag: Clone,
    Tag2: Clone,
{
    ext2(
        0,
        0,
        &mut |x, y| {
            DifferentiableTagged::of_scalar(x.borrow_scalar().clone() * y.borrow_scalar().clone())
        },
        x.to_unranked_borrow(),
        y.to_unranked_borrow(),
    )
    .attach_rank::<RANK1>()
    .unwrap()
}

/// Produce the element-wise multiplication of the inputs, threading where necessary until the
/// first argument has rank 2 and the second argument has rank 1.
/// This is essentially "matrix-multiply a matrix by a vector, but don't do the sum; instead
/// leave the components to be summed in a vector".
pub fn star_2_1<T, Tag, Tag2>(
    x: &DifferentiableTagged<T, Tag>,
    y: &DifferentiableTagged<T, Tag2>,
) -> Differentiable<T>
where
    T: Clone + Sum + Mul<Output = T> + Default,
    Tag: Clone,
    Tag2: Clone,
{
    ext2(
        2,
        1,
        &mut |x, y| {
            elementwise_mul_via_ext(
                &x.clone().attach_rank::<2>().unwrap(),
                &y.clone().attach_rank::<1>().unwrap(),
            )
            .to_unranked()
        },
        x,
        y,
    )
}

fn sum_1_scalar<A, Tag>(x: RankedDifferentiableTagged<A, Tag, 1>) -> Scalar<A>
where
    A: Sum<A> + Clone + Add<Output = A> + Zero,
{
    RankedDifferentiableTagged::to_vector(x)
        .into_iter()
        .map(|x| x.to_scalar())
        .sum()
}

pub fn sum_1<A, Tag>(x: RankedDifferentiableTagged<A, Tag, 1>) -> Differentiable<A>
where
    A: Sum<A> + Clone + Add<Output = A> + Zero,
{
    DifferentiableTagged::of_scalar(sum_1_scalar(x))
}

pub fn sum<T>(x: &Differentiable<T>) -> Differentiable<T>
where
    T: Sum<T> + Clone + Add<Output = T> + Zero,
{
    ext1(1, &mut |y| sum_1(y.clone().attach_rank::<1>().unwrap()), x)
}

/// Matrix-multiply W with T, threading where necessary until the first argument has rank 2 and the
/// second argument has rank 1.
pub fn dot_2_1<A, Tag, Tag2>(
    w: &DifferentiableTagged<A, Tag>,
    t: &DifferentiableTagged<A, Tag2>,
) -> Differentiable<A>
where
    A: NumLike + Default,
    Tag: Clone,
    Tag2: Clone,
{
    assert!(
        w.rank() >= 2,
        "w needed to have rank 2 or more, was {}",
        w.rank()
    );
    assert!(
        t.rank() >= 1,
        "t needed to have rank 1 or more, was {}",
        t.rank()
    );
    sum(&star_2_1(w, t))
}

pub fn linear<A, Tag1, Tag2, Tag3>(
    theta0: &DifferentiableTagged<A, Tag1>,
    theta1: &DifferentiableTagged<A, Tag2>,
    t: &DifferentiableTagged<A, Tag3>,
) -> DifferentiableTagged<A, ()>
where
    A: NumLike + Default,
    Tag1: Clone,
    Tag2: Clone,
    Tag3: Clone,
{
    dot_2_1(theta0, t).map2_tagged(theta1, &mut |x, _, y, _| (x.clone() + y.clone(), ()))
}

pub fn relu<A, Tag1, Tag2, Tag3>(
    t: &RankedDifferentiableTagged<A, Tag1, 1>,
    theta0: &RankedDifferentiableTagged<A, Tag2, 2>,
    theta1: &RankedDifferentiableTagged<A, Tag3, 1>,
) -> Differentiable<A>
where
    A: NumLike + PartialOrd + Default,
    Tag1: Clone,
    Tag2: Clone,
    Tag3: Clone,
{
    linear(
        theta0.to_unranked_borrow(),
        theta1.to_unranked_borrow(),
        t.to_unranked_borrow(),
    )
    .map(&mut rectify)
}

pub fn k_relu<A, Tag>(
    t: &RankedDifferentiableTagged<A, Tag, 1>,
    theta: &[Differentiable<A>],
) -> Differentiable<A>
where
    Tag: Clone,
    A: NumLike + PartialOrd + Default,
{
    // Note: the assertion's condition was inverted in the original (`theta.len() < 2`),
    // contradicting its own message; the intent is clearly "at least 2".
    assert!(theta.len() >= 2, "Needed at least 2 parameters for k_relu");
    let once = relu(
        t,
        &theta[0].clone().attach_rank::<2>().unwrap(),
        &theta[1].clone().attach_rank::<1>().unwrap(),
    );
    if theta.len() == 2 {
        once
    } else {
        k_relu(&once.attach_rank().unwrap(), &theta[2..])
    }
}

#[cfg(test)]
mod tests {
    use crate::auto_diff::{Differentiable, RankedDifferentiable};
    use crate::ext::{dot_2_1, ext1, relu, star_2_1};
    use crate::not_nan::{to_not_nan_1, to_not_nan_2};
    use crate::scalar::Scalar;
    use crate::traits::Zero;
    use ordered_float::NotNan;

    fn zeros_redefined<A>(t: &Differentiable<A>) -> Differentiable<A>
    where
        A: Zero,
    {
        ext1(
            0,
            &mut |_| Differentiable::of_scalar(Scalar::make(A::zero())),
            t,
        )
    }

    #[test]
    fn define_zeros() {
        let shape = RankedDifferentiable::of_slice_2::<_, 2>(&to_not_nan_2([
            [1.0, 2.0],
            [3.0, 4.0],
            [5.0, 6.0],
        ]));
        let zeros = zeros_redefined(&shape.to_unranked());
        let to_zeros = zeros
            .attach_rank::<2>()
            .unwrap()
            .to_vector()
            .iter()
            .map(|x| {
                (*x).clone()
                    .to_vector()
                    .iter()
                    .map(|x| (*x).clone().to_scalar().clone_real_part().into_inner())
                    .collect::<Vec<_>>()
            })
            .collect::<Vec<_>>();
        assert_eq!(to_zeros, [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])
    }

    fn flatten_2<A>(t: RankedDifferentiable<A, 2>) -> RankedDifferentiable<A, 1>
    where
        A: Clone,
    {
        let mut result = Vec::new();
        for v in t.to_unranked_borrow().borrow_vector() {
            result.extend((*v.borrow_vector()).clone())
        }
        Differentiable::of_vec(result).attach_rank::<1>().unwrap()
    }

    #[test]
    fn test_flatten_2() {
        let input = RankedDifferentiable::of_slice_2::<_, 2>(&to_not_nan_2([
            [1.0, 0.5],
            [3.1, 2.2],
            [7.3, 2.1],
        ]));
        let flattened = flatten_2(input);
        let reshaped = flattened
            .to_vector()
            .iter()
            .map(|x| (*x).clone().to_scalar().clone_real_part().into_inner())
            .collect::<Vec<_>>();
        assert_eq!(reshaped, [1.0, 0.5, 3.1, 2.2, 7.3, 2.1])
    }

    #[test]
    fn test_flatten() {
        let flatten = |t: &Differentiable<NotNan<f64>>| {
            ext1(
                2,
                &mut |t| flatten_2((*t).clone().attach_rank::<2>().unwrap()).to_unranked(),
                t,
            )
        };
        let input = RankedDifferentiable::of_vector(vec![
            RankedDifferentiable::of_slice_2::<_, 2>(&to_not_nan_2([
                [1.0, 0.5],
                [3.1, 2.2],
                [7.3, 2.1],
            ])),
            RankedDifferentiable::of_slice_2::<_, 2>(&to_not_nan_2([
                [2.9, 3.5],
                [0.7, 1.5],
                [2.5, 6.4],
            ])),
        ]);

        let flattened = flatten(&input.to_unranked())
            .attach_rank::<2>()
            .unwrap()
            .to_vector()
            .iter()
            .map(|i| {
                i.to_unranked_borrow()
                    .borrow_vector()
                    .iter()
                    .map(|j| j.borrow_scalar().clone_real_part().into_inner())
                    .collect::<Vec<_>>()
            })
            .collect::<Vec<_>>();

        assert_eq!(
            flattened,
            [
                [1.0, 0.5, 3.1, 2.2, 7.3, 2.1],
                [2.9, 3.5, 0.7, 1.5, 2.5, 6.4]
            ]
        )
    }

    #[test]
    fn test_star_2_1_a() {
        let input1 = RankedDifferentiable::of_slice_2::<_, 2>(&to_not_nan_2([
            [3.0, 4.0, 5.0],
            [7.0, 8.0, 9.0],
        ]));
        let input2 = RankedDifferentiable::of_slice(&to_not_nan_1([2.0, 4.0, 3.0]));

        let output = star_2_1(input1.to_unranked_borrow(), input2.to_unranked_borrow())
            .into_vector()
            .iter()
            .map(|x| {
                x.clone()
                    .into_vector()
                    .iter()
                    .map(|i| i.clone().into_scalar().clone_real_part().into_inner())
                    .collect::<Vec<_>>()
            })
            .collect::<Vec<_>>();

        assert_eq!(output, [[6.0, 16.0, 15.0], [14.0, 32.0, 27.0]])
    }

    #[test]
    fn test_star_2_1_b() {
        let input1 = RankedDifferentiable::of_slice_2::<_, 2>(&to_not_nan_2([
            [8.0, 1.0],
            [7.0, 3.0],
            [5.0, 4.0],
        ]));
        let input2 = RankedDifferentiable::of_slice_2::<_, 2>(&to_not_nan_2([
            [6.0, 2.0],
            [4.0, 9.0],
            [3.0, 8.0],
        ]));

        let output = star_2_1(input1.to_unranked_borrow(), input2.to_unranked_borrow())
            .into_vector()
            .iter()
            .map(|x| {
                x.clone()
                    .into_vector()
                    .iter()
                    .map(|i| {
                        i.clone()
                            .into_vector()
                            .iter()
                            .map(|i| i.borrow_scalar().clone_real_part().into_inner())
                            .collect::<Vec<_>>()
                    })
                    .collect::<Vec<_>>()
            })
            .collect::<Vec<_>>();

        assert_eq!(
            output,
            [
                [[48.0, 2.0], [42.0, 6.0], [30.0, 8.0]],
                [[32.0, 9.0], [28.0, 27.0], [20.0, 36.0]],
                [[24.0, 8.0], [21.0, 24.0], [15.0, 32.0]]
            ]
        )
    }

    #[test]
    fn test_dot_2_1() {
        let w = RankedDifferentiable::of_slice_2::<_, 2>(&to_not_nan_2([
            [2.0, 1.0, 3.1],
            [3.7, 4.0, 6.1],
        ]));
        let t = RankedDifferentiable::of_slice(&to_not_nan_1([1.3, 0.4, 3.3]));

        let result = dot_2_1(w.to_unranked_borrow(), t.to_unranked_borrow())
            .attach_rank::<1>()
            .unwrap()
            .to_vector()
            .iter()
            .map(|x| x.clone().to_scalar().clone_real_part().into_inner())
            .collect::<Vec<_>>();
        assert_eq!(result, [13.23, 26.54])
    }

    #[test]
    fn test_relu() {
        let weights = to_not_nan_2([
            [7.1, 4.3, -6.4],
            [1.0, 2.0, 3.0],
            [4.0, 5.0, 6.0],
            [-1.3, -2.4, -3.6],
        ]);
        let biases = to_not_nan_1([10.2, 11.3, 12.4, 13.5]);
        let inputs = to_not_nan_1([7.0, 8.0, 9.0]);
        let theta0 = RankedDifferentiable::of_slice_2::<_, 2>(&weights);
        let theta1 = RankedDifferentiable::of_slice(&biases);
        let t = RankedDifferentiable::of_slice(&inputs);

        let result = relu(&t, &theta0, &theta1)
            .into_vector()
            .iter()
            .map(|x| x.borrow_scalar().clone_real_part().into_inner())
            .collect::<Vec<_>>();

        let mut expected = Vec::new();
        for (weights, bias) in weights.iter().zip(biases.iter()) {
            expected.push(
                crate::decider::relu(
                    &t,
                    &RankedDifferentiable::of_slice(weights),
                    Scalar::make(*bias),
                )
                .clone_real_part()
                .into_inner(),
            );
        }

        assert_eq!(result, expected);
    }
}
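A hypothetical sketch (not part of this diff) of what `ext1`'s rank-directed recursion buys you: lifting a scalar-level operation to tensors of any rank. `borrow_scalar`, `of_scalar`, and `Scalar::make` are used exactly as in the module above, and the `NotNan` wrapper matches the tests:

    // Lift "add one" from rank 0 to any tensor: ext1 recurses through the
    // tensor's nesting until it reaches rank 0, then rebuilds the structure
    // with the function applied at every scalar leaf.
    fn add_one_everywhere(t: &Differentiable<NotNan<f64>>) -> Differentiable<NotNan<f64>> {
        ext1(
            0,
            &mut |leaf| {
                Differentiable::of_scalar(
                    leaf.borrow_scalar().clone()
                        + Scalar::make(NotNan::new(1.0).expect("not nan")),
                )
            },
            t,
        )
    }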
little_learner/src/gradient_descent.rs (new file)

@@ -0,0 +1,462 @@
use crate::auto_diff::{grad, Differentiable, RankedDifferentiable};
use crate::hyper;
use crate::loss::l2_loss_2;
use crate::predictor::Predictor;
use crate::sample;
use crate::traits::NumLike;
use rand::Rng;
use std::hash::Hash;

fn iterate<A, F>(mut f: F, start: A, n: u32) -> A
where
    F: FnMut(A) -> A,
{
    let mut v = start;
    for _ in 0..n {
        v = f(v);
    }
    v
}

/// `adjust` takes the previous value and a delta, and returns a deflated new value.
fn general_gradient_descent_step<
    A,
    F,
    Inflated,
    Deflate,
    Adjust,
    Hyper,
    const RANK: usize,
    const PARAM_NUM: usize,
>(
    f: &mut F,
    theta: [Inflated; PARAM_NUM],
    deflate: Deflate,
    hyper: Hyper,
    mut adjust: Adjust,
) -> [Inflated; PARAM_NUM]
where
    A: Clone + NumLike + Hash + Eq,
    F: FnMut(&[Differentiable<A>; PARAM_NUM]) -> RankedDifferentiable<A, RANK>,
    Deflate: FnMut(Inflated) -> Differentiable<A>,
    Inflated: Clone,
    Hyper: Clone,
    Adjust: FnMut(Inflated, &Differentiable<A>, Hyper) -> Inflated,
{
    let deflated = theta.clone().map(deflate);
    let delta = grad(f, &deflated);
    let mut i = 0;
    theta.map(|inflated| {
        let delta = &delta[i];
        i += 1;
        adjust(inflated, delta, hyper.clone())
    })
}

pub fn gradient_descent<
    'a,
    T,
    R,
    Point,
    F,
    G,
    H,
    Inflated,
    Hyper,
    ImmutableHyper,
    const IN_SIZE: usize,
    const PARAM_NUM: usize,
>(
    hyper: Hyper,
    xs: &'a [Point],
    to_ranked_differentiable: G,
    ys: &[T],
    zero_params: [Differentiable<T>; PARAM_NUM],
    mut predictor: Predictor<F, Inflated, Differentiable<T>, ImmutableHyper>,
    to_immutable: H,
) -> [Differentiable<T>; PARAM_NUM]
where
    T: NumLike + Hash + Copy + Default,
    Point: 'a + Copy,
    F: Fn(
        RankedDifferentiable<T, IN_SIZE>,
        &[Differentiable<T>; PARAM_NUM],
    ) -> RankedDifferentiable<T, 1>,
    G: for<'b> Fn(&'b [Point]) -> RankedDifferentiable<T, IN_SIZE>,
    Inflated: Clone,
    ImmutableHyper: Clone,
    Hyper: Into<hyper::BaseGradientDescent<R>>,
    H: FnOnce(&Hyper) -> ImmutableHyper,
    R: Rng,
{
    let sub_hypers = to_immutable(&hyper);
    let mut gradient_hyper: hyper::BaseGradientDescent<R> = hyper.into();
    let iterations = gradient_hyper.iterations;
    let out = iterate(
        |theta| {
            general_gradient_descent_step(
                &mut |x| match gradient_hyper.sampling.as_mut() {
                    None => RankedDifferentiable::of_vector(vec![RankedDifferentiable::of_scalar(
                        l2_loss_2(
                            &predictor.predict,
                            to_ranked_differentiable(xs),
                            RankedDifferentiable::of_slice(ys),
                            x,
                        ),
                    )]),
                    Some((rng, batch_size)) => {
                        let (sampled_xs, sampled_ys) = sample::take_2(rng, *batch_size, xs, ys);
                        RankedDifferentiable::of_vector(vec![RankedDifferentiable::of_scalar(
                            l2_loss_2(
                                &predictor.predict,
                                to_ranked_differentiable(&sampled_xs),
                                RankedDifferentiable::of_slice(&sampled_ys),
                                x,
                            ),
                        )])
                    }
                },
                theta,
                predictor.deflate,
                sub_hypers.clone(),
                predictor.update,
            )
        },
        zero_params.map(predictor.inflate),
        iterations,
    );
    out.map(&mut predictor.deflate)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::auto_diff::RankedDifferentiableTagged;
    use crate::hyper;
    use crate::loss::{predict_line_2_unranked, predict_plane, predict_quadratic_unranked};
    use crate::not_nan::{to_not_nan_1, to_not_nan_2};
    use crate::predictor;
    use crate::scalar::Scalar;
    use crate::traits::Zero;
    use ordered_float::NotNan;
    use rand::rngs::StdRng;
    use rand::SeedableRng;

    #[test]
    fn test_iterate() {
        let f = |t: [i32; 3]| t.map(|i| i - 3);
        assert_eq!(iterate(f, [1, 2, 3], 5u32), [-14, -13, -12]);
    }

    #[test]
    fn first_optimisation_test() {
        let xs = [2.0, 1.0, 4.0, 3.0];
        let ys = [1.8, 1.2, 4.2, 3.3];

        let zero = Scalar::<NotNan<f64>>::zero();

        let hyper = hyper::NakedGradientDescent::new(NotNan::new(0.01).expect("not nan"), 1000);
        let iterated = {
            let xs = to_not_nan_1(xs);
            let ys = to_not_nan_1(ys);
            let zero_params = [
                RankedDifferentiable::of_scalar(zero.clone()).to_unranked(),
                RankedDifferentiable::of_scalar(zero).to_unranked(),
            ];
            gradient_descent(
                hyper,
                &xs,
                |b| RankedDifferentiable::of_slice(b),
                &ys,
                zero_params,
                predictor::naked(predict_line_2_unranked),
                hyper::NakedGradientDescent::to_immutable,
            )
        };
        let iterated = iterated
            .into_iter()
            .map(|x| x.into_scalar().real_part().into_inner())
            .collect::<Vec<_>>();

        assert_eq!(iterated, vec![1.0499993623489503, 0.0000018747718457656533]);
    }

    #[test]
    fn optimise_quadratic() {
        let xs = [-1.0, 0.0, 1.0, 2.0, 3.0];
        let ys = [2.55, 2.1, 4.35, 10.2, 18.25];

        let zero = Scalar::<NotNan<f64>>::zero();

        let hyper = hyper::NakedGradientDescent::new(NotNan::new(0.001).expect("not nan"), 1000);

        let iterated = {
            let xs = to_not_nan_1(xs);
            let ys = to_not_nan_1(ys);
            let zero_params = [
                RankedDifferentiable::of_scalar(zero.clone()).to_unranked(),
                RankedDifferentiable::of_scalar(zero.clone()).to_unranked(),
                RankedDifferentiable::of_scalar(zero).to_unranked(),
            ];
            gradient_descent(
                hyper,
                &xs,
                |b| RankedDifferentiable::of_slice(b),
                &ys,
                zero_params,
                predictor::naked(predict_quadratic_unranked),
                hyper::NakedGradientDescent::to_immutable,
            )
        };
        let iterated = iterated
            .into_iter()
            .map(|x| x.into_scalar().real_part().into_inner())
            .collect::<Vec<_>>();

        assert_eq!(
            iterated,
            [2.0546423148479684, 0.9928606519360353, 1.4787394427094362]
        );
    }

    const PLANE_XS: [[f64; 2]; 6] = [
        [1.0, 2.05],
        [1.0, 3.0],
        [2.0, 2.0],
        [2.0, 3.91],
        [3.0, 6.13],
        [4.0, 8.09],
    ];
    const PLANE_YS: [f64; 6] = [13.99, 15.99, 18.0, 22.4, 30.2, 37.94];

    #[test]
    fn optimise_plane() {
        let hyper = hyper::NakedGradientDescent::new(NotNan::new(0.001).expect("not nan"), 1000);

        let iterated = {
            let xs = to_not_nan_2(PLANE_XS);
            let ys = to_not_nan_1(PLANE_YS);
            let zero_params = [
                RankedDifferentiable::of_slice(&[NotNan::zero(), NotNan::zero()]).to_unranked(),
                Differentiable::of_scalar(Scalar::zero()),
            ];
            gradient_descent(
                hyper,
                &xs,
                RankedDifferentiable::of_slice_2::<_, 2>,
                &ys,
                zero_params,
                predictor::naked(predict_plane),
                hyper::NakedGradientDescent::to_immutable,
            )
        };

        let [theta0, theta1] = iterated;

        let theta0 = theta0.attach_rank::<1>().expect("rank 1 tensor");
        let theta1 = theta1.attach_rank::<0>().expect("rank 0 tensor");

        assert_eq!(theta0.collect(), [3.97757644609063, 2.0496557321494446]);
        assert_eq!(
            theta1.to_scalar().real_part().into_inner(),
            5.786758464448078
        );
    }

    #[test]
    fn optimise_plane_with_sampling() {
        let rng = StdRng::seed_from_u64(314159);
        let hyper = hyper::NakedGradientDescent::new(NotNan::new(0.001).expect("not nan"), 1000)
            .with_rng(rng, 4);

        let iterated = {
            let xs = to_not_nan_2(PLANE_XS);
            let ys = to_not_nan_1(PLANE_YS);
            let zero_params = [
                RankedDifferentiable::of_slice(&[NotNan::zero(), NotNan::zero()]).to_unranked(),
                Differentiable::of_scalar(Scalar::zero()),
            ];
            gradient_descent(
                hyper,
                &xs,
                RankedDifferentiable::of_slice_2::<_, 2>,
                &ys,
                zero_params,
                predictor::naked(predict_plane),
                hyper::NakedGradientDescent::to_immutable,
            )
        };

        let [theta0, theta1] = iterated;

        let theta0 = theta0.attach_rank::<1>().expect("rank 1 tensor").collect();
        let theta1 = theta1
            .attach_rank::<0>()
            .expect("rank 0 tensor")
            .to_scalar()
            .real_part()
            .into_inner();

        /*
        Mathematica code to verify by eye that the optimisation gave a reasonable result:

        xs = {{1.0, 2.05}, {1.0, 3.0}, {2.0, 2.0}, {2.0, 3.91}, {3.0,
           6.13}, {4.0, 8.09}};
        ys = {13.99, 15.99, 18.0, 22.4, 30.2, 37.94};
        points = ListPointPlot3D[Append @@@ Transpose[{xs, ys}]];

        withoutBatching0 = {3.97757644609063, 2.0496557321494446};
        withoutBatching1 = 5.2839863438547159;
        withoutBatching =
          Plot3D[{x, y} . withoutBatching0 + withoutBatching1, {x, 0, 4}, {y,
            0, 8}];

        withBatching0 = {3.8581694055684781, 2.2166222673968554};
        withBatching1 = 5.2399202468216668;
        withBatching =
          Plot3D[{x, y} . withBatching0 + withBatching1, {x, 0, 4}, {y, 0, 8}];

        Show[points, withoutBatching]

        Show[points, withBatching]
        */

        assert_eq!(theta0, [3.858_169_405_568_478, 2.2166222673968554]);
        assert_eq!(theta1, 5.283_986_343_854_716);
    }

    #[test]
    fn test_with_velocity() {
        let hyper = hyper::VelocityGradientDescent::zero_momentum(
            NotNan::new(0.001).expect("not nan"),
            1000,
        )
        .with_mu(NotNan::new(0.9).expect("not nan"));

        let iterated = {
            let xs = to_not_nan_2(PLANE_XS);
            let ys = to_not_nan_1(PLANE_YS);
            let zero_params = [
                RankedDifferentiable::of_slice(&[NotNan::<f64>::zero(), NotNan::<f64>::zero()])
                    .to_unranked(),
                Differentiable::of_scalar(Scalar::zero()),
            ];

            gradient_descent(
                hyper,
                &xs,
                RankedDifferentiableTagged::of_slice_2::<_, 2>,
                &ys,
                zero_params,
                predictor::velocity(predict_plane),
                hyper::VelocityGradientDescent::to_immutable,
            )
        };

        let [theta0, theta1] = iterated;

        let theta0 = theta0.attach_rank::<1>().expect("rank 1 tensor");
        let theta1 = theta1.attach_rank::<0>().expect("rank 0 tensor");

        assert_eq!(theta0.collect(), [3.979645447136021, 1.976454920954754]);
        assert_eq!(
            theta1.to_scalar().real_part().into_inner(),
            6.169579045974949
        );
    }

    #[test]
    fn test_with_rms() {
        let beta = NotNan::new(0.9).expect("not nan");
        let stabilizer = NotNan::new(0.00000001).expect("not nan");
        let hyper = hyper::RmsGradientDescent::default(NotNan::new(0.01).expect("not nan"), 3000)
            .with_stabilizer(stabilizer)
            .with_beta(beta);

        let iterated = {
            let xs = to_not_nan_2(PLANE_XS);
            let ys = to_not_nan_1(PLANE_YS);
            let zero_params = [
                RankedDifferentiable::of_slice(&[NotNan::<f64>::zero(), NotNan::<f64>::zero()])
                    .to_unranked(),
                Differentiable::of_scalar(Scalar::zero()),
            ];

            gradient_descent(
                hyper,
                &xs,
                RankedDifferentiableTagged::of_slice_2::<_, 2>,
                &ys,
                zero_params,
                predictor::rms(predict_plane),
                hyper::RmsGradientDescent::to_immutable,
            )
        };

        let [theta0, theta1] = iterated;

        let theta0 = theta0.attach_rank::<1>().expect("rank 1 tensor");
        let theta1 = theta1.attach_rank::<0>().expect("rank 0 tensor");

        let fitted_theta0 = theta0
            .collect()
            .iter()
            .map(|x| x.into_inner())
            .collect::<Vec<_>>();
        let fitted_theta1 = theta1.to_scalar().real_part().into_inner();
        assert_eq!(
            fitted_theta0,
            [3.974_645_444_172_085, 1.971_454_922_077_495]
        );
        assert_eq!(fitted_theta1, 6.164_579_048_274_036);
    }

    #[test]
    fn test_with_adam() {
        let beta = NotNan::new(0.9).expect("not nan");
        let stabilizer = NotNan::new(0.00000001).expect("not nan");
        let mu = NotNan::new(0.85).expect("not nan");
        // Erratum in the book: they printed 0.001 but intended 0.01.
        let hyper = hyper::AdamGradientDescent::default(NotNan::new(0.01).expect("not nan"), 1500)
            .with_stabilizer(stabilizer)
            .with_beta(beta)
            .with_mu(mu);

        let iterated = {
            let xs = to_not_nan_2(PLANE_XS);
            let ys = to_not_nan_1(PLANE_YS);
            let zero_params = [
                RankedDifferentiable::of_slice(&[NotNan::<f64>::zero(), NotNan::<f64>::zero()])
                    .to_unranked(),
                Differentiable::of_scalar(Scalar::zero()),
            ];

            gradient_descent(
                hyper,
                &xs,
                RankedDifferentiableTagged::of_slice_2::<_, 2>,
                &ys,
                zero_params,
                predictor::adam(predict_plane),
                hyper::AdamGradientDescent::to_immutable,
            )
        };

        let [theta0, theta1] = iterated;

        let theta0 = theta0.attach_rank::<1>().expect("rank 1 tensor");
        let theta1 = theta1.attach_rank::<0>().expect("rank 0 tensor");

        let fitted_theta0 = theta0
            .collect()
            .iter()
            .map(|x| x.into_inner())
            .collect::<Vec<_>>();
        let fitted_theta1 = theta1.to_scalar().real_part().into_inner();
        assert_eq!(
            fitted_theta0,
            [3.980_262_420_345_729_5, 1.977_071_898_301_444]
        );
        assert_eq!(fitted_theta1, 6.170_196_024_282_712_5);
    }
}
little_learner/src/hyper.rs (new file)

@@ -0,0 +1,265 @@
use crate::predictor::{AdamHyper, NakedHypers, RmsHyper, VelocityHypers};
use crate::traits::{NumLike, Zero};
use rand::rngs::StdRng;

/// Hyperparameters which apply to any possible optimisation algorithm that uses gradient descent.
pub struct BaseGradientDescent<Rng> {
    pub sampling: Option<(Rng, usize)>,
    pub iterations: u32,
}

impl BaseGradientDescent<StdRng> {
    #[must_use]
    pub fn new(iterations: u32) -> BaseGradientDescent<StdRng> {
        BaseGradientDescent {
            sampling: None,
            iterations,
        }
    }
}

impl<Rng> BaseGradientDescent<Rng> {
    #[must_use]
    pub fn with_rng<Rng2>(self, rng: Rng2, size: usize) -> BaseGradientDescent<Rng2> {
        BaseGradientDescent {
            iterations: self.iterations,
            sampling: Some((rng, size)),
        }
    }

    #[must_use]
    pub fn with_iterations(self, n: u32) -> Self {
        BaseGradientDescent {
            sampling: self.sampling,
            iterations: n,
        }
    }
}

pub struct NakedGradientDescent<A, Rng> {
    base: BaseGradientDescent<Rng>,
    naked: NakedHypers<A>,
}

impl<A> NakedGradientDescent<A, StdRng>
where
    A: Zero,
{
    #[must_use]
    pub fn new(learning_rate: A, iterations: u32) -> Self {
        NakedGradientDescent {
            base: BaseGradientDescent::new(iterations),
            naked: NakedHypers { learning_rate },
        }
    }
}

impl<A, Rng> NakedGradientDescent<A, Rng> {
    pub fn to_immutable(&self) -> NakedHypers<A>
    where
        A: Clone,
    {
        self.naked.clone()
    }

    #[must_use]
    pub fn with_rng<Rng2>(self, rng: Rng2, size: usize) -> NakedGradientDescent<A, Rng2> {
        NakedGradientDescent {
            base: self.base.with_rng(rng, size),
            naked: self.naked,
        }
    }
}

impl<A, Rng> From<NakedGradientDescent<A, Rng>> for BaseGradientDescent<Rng> {
    fn from(val: NakedGradientDescent<A, Rng>) -> BaseGradientDescent<Rng> {
        val.base
    }
}

pub struct VelocityGradientDescent<A, Rng> {
    base: BaseGradientDescent<Rng>,
    velocity: VelocityHypers<A>,
}

impl<A> VelocityGradientDescent<A, StdRng>
where
    A: Zero,
{
    #[must_use]
    pub fn zero_momentum(learning_rate: A, iterations: u32) -> Self {
        VelocityGradientDescent {
            base: BaseGradientDescent::new(iterations),
            velocity: VelocityHypers {
                learning_rate,
                mu: A::zero(),
            },
        }
    }
}

impl<A, Rng> VelocityGradientDescent<A, Rng> {
    #[must_use]
    pub fn with_mu(self, mu: A) -> Self {
        VelocityGradientDescent {
            base: self.base,
            velocity: VelocityHypers {
                learning_rate: self.velocity.learning_rate,
                mu,
            },
        }
    }

    pub fn to_immutable(&self) -> VelocityHypers<A>
    where
        A: Clone,
    {
        self.velocity.clone()
    }
}

impl<A, Rng> From<VelocityGradientDescent<A, Rng>> for BaseGradientDescent<Rng> {
    fn from(val: VelocityGradientDescent<A, Rng>) -> BaseGradientDescent<Rng> {
        val.base
    }
}

fn ten<A>() -> A
where
    A: NumLike,
{
    let two = A::one() + A::one();
    two.clone() * two.clone() * two.clone() + two
}

fn one_ten_k<A>() -> A
where
    A: NumLike,
{
    let one_tenth = A::one() / ten();
    let one_hundredth = one_tenth.clone() * one_tenth;
    one_hundredth.clone() * one_hundredth
}

pub struct RmsGradientDescent<A, Rng> {
    base: BaseGradientDescent<Rng>,
    rms: RmsHyper<A>,
}

impl<A> RmsGradientDescent<A, StdRng> {
    pub fn default(learning_rate: A, iterations: u32) -> Self
    where
        A: NumLike,
    {
        RmsGradientDescent {
            base: BaseGradientDescent::new(iterations),
            rms: RmsHyper {
                stabilizer: one_ten_k::<A>() * one_ten_k(),
                beta: A::one() + -(A::one() / ten()),
                learning_rate,
            },
        }
    }
}

impl<A, Rng> RmsGradientDescent<A, Rng> {
    #[must_use]
    pub fn with_stabilizer(self, stabilizer: A) -> Self {
        RmsGradientDescent {
            base: self.base,
            rms: RmsHyper {
                stabilizer,
                beta: self.rms.beta,
                learning_rate: self.rms.learning_rate,
            },
        }
    }

    #[must_use]
    pub fn with_beta(self, beta: A) -> Self {
        RmsGradientDescent {
            base: self.base,
            rms: RmsHyper {
                stabilizer: self.rms.stabilizer,
                beta,
                learning_rate: self.rms.learning_rate,
            },
        }
    }

    pub fn to_immutable(&self) -> RmsHyper<A>
    where
        A: Clone,
    {
        self.rms.clone()
    }
}

impl<A, Rng> From<RmsGradientDescent<A, Rng>> for BaseGradientDescent<Rng> {
    fn from(val: RmsGradientDescent<A, Rng>) -> BaseGradientDescent<Rng> {
        val.base
    }
}

pub struct AdamGradientDescent<A, Rng> {
    base: BaseGradientDescent<Rng>,
    adam: AdamHyper<A>,
}

impl<A> AdamGradientDescent<A, StdRng> {
    pub fn default(learning_rate: A, iterations: u32) -> Self
    where
        A: NumLike,
    {
        AdamGradientDescent {
            base: BaseGradientDescent::new(iterations),
            adam: AdamHyper {
                mu: A::zero(),
                rms: RmsHyper {
                    learning_rate,
                    stabilizer: one_ten_k::<A>() * one_ten_k(),
                    beta: A::one() + -(A::one() / ten()),
                },
            },
        }
    }
}

impl<A, Rng> AdamGradientDescent<A, Rng> {
    #[must_use]
    pub fn with_stabilizer(self, stabilizer: A) -> Self {
        AdamGradientDescent {
            base: self.base,
            adam: self.adam.with_stabilizer(stabilizer),
        }
    }

    #[must_use]
    pub fn with_beta(self, beta: A) -> Self {
        AdamGradientDescent {
            base: self.base,
            adam: self.adam.with_beta(beta),
        }
    }

    #[must_use]
    pub fn with_mu(self, mu: A) -> Self {
        AdamGradientDescent {
            base: self.base,
            adam: self.adam.with_mu(mu),
        }
    }

    pub fn to_immutable(&self) -> AdamHyper<A>
    where
        A: Clone,
    {
        self.adam.clone()
    }
}

impl<A, Rng> From<AdamGradientDescent<A, Rng>> for BaseGradientDescent<Rng> {
    fn from(val: AdamGradientDescent<A, Rng>) -> BaseGradientDescent<Rng> {
        val.base
    }
}
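For reference, a sketch of the textbook update rules these hyperparameter bundles parameterise (the actual adjust implementations live in the predictor module, which this diff does not show), writing g for the gradient:

    velocity (mu):           v <- mu * v - learning_rate * g;    theta <- theta + v
    RMS (beta, stabilizer):  r <- beta * r + (1 - beta) * g^2;   theta <- theta - learning_rate * g / (sqrt(r) + stabilizer)
    Adam (mu + RMS fields):  combines the two, tracking both a first and a second moment of g

The defaults constructed above work out to beta = 1 - 1/10 = 0.9 and stabilizer = one_ten_k() * one_ten_k() = 10^-4 * 10^-4 = 10^-8, matching the values that test_with_rms and test_with_adam pass explicitly.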
little_learner/src/layer.rs (new file)

@@ -0,0 +1,75 @@
use crate::auto_diff::{Differentiable, RankedDifferentiable, RankedDifferentiableTagged};
use crate::decider::relu;
use crate::traits::NumLike;

/// Returns a tensor1.
/// Theta has two components: a tensor2 of weights and a tensor1 of bias.
pub fn layer<T>(
    theta: Differentiable<T>,
    t: RankedDifferentiable<T, 1>,
) -> RankedDifferentiable<T, 1>
where
    T: NumLike + PartialOrd,
{
    let mut theta = theta.into_vector();
    assert_eq!(theta.len(), 2, "Needed weights and a bias");
    let b = theta.pop().unwrap().attach_rank::<1>().unwrap();
    let w = theta.pop().unwrap().attach_rank::<2>().unwrap();

    RankedDifferentiableTagged::map2_once(
        &w,
        &b,
        &mut |w: &RankedDifferentiable<_, 1>, b: &RankedDifferentiable<_, 0>| {
            RankedDifferentiableTagged::of_scalar(relu(&t, w, b.clone().to_scalar()))
        },
    )
}

#[cfg(test)]
mod tests {
    use crate::auto_diff::{Differentiable, RankedDifferentiable};
    use crate::layer::layer;
    use crate::not_nan::{to_not_nan_1, to_not_nan_2};

    #[test]
    fn test_single_layer() {
        let b = RankedDifferentiable::of_slice(&to_not_nan_1([1.0, 2.0]));
        let w = RankedDifferentiable::of_slice_2::<_, 2>(&to_not_nan_2([
            [3.0, 4.0, 5.0],
            [6.0, 7.0, 8.0],
        ]));
        let theta = Differentiable::of_vec(vec![w.to_unranked(), b.to_unranked()]);

        /*
        Two neurons:
        w =
          (3 4 5
           6 7 8)
        b = (1, 2)

        Three inputs:
        t = (9, 10, 11)

        Output has two elements, one per neuron.
        Neuron 1 has weights (3,4,5) and bias 1;
        Neuron 2 has weights (6,7,8) and bias 2.

        Neuron 1 is relu(t, (3,4,5), 1), which is (9, 10, 11).(3, 4, 5) + 1.
        Neuron 2 is relu(t, (6,7,8), 2), which is (9, 10, 11).(6, 7, 8) + 2.
        */

        let t = RankedDifferentiable::of_slice(&to_not_nan_1([9.0, 10.0, 11.0]));
        let mut output = layer(theta, t)
            .to_vector()
            .iter()
            .map(|t| (*t).clone().to_scalar().clone_real_part().into_inner())
            .collect::<Vec<_>>();

        assert_eq!(output.len(), 2);
        let result_2 = output.pop().unwrap();
        let result_1 = output.pop().unwrap();

        assert_eq!(result_1, (9 * 3 + 10 * 4 + 11 * 5 + 1) as f64);
        assert_eq!(result_2, (9 * 6 + 10 * 7 + 11 * 8 + 2) as f64);
    }
}
@@ -1,11 +1,19 @@
 #![allow(incomplete_features)]
 #![feature(generic_const_exprs)]
+#![feature(array_methods)]
+#![feature(closure_lifetime_binder)]

 pub mod auto_diff;
-pub mod const_teq;
-pub mod expr_syntax_tree;
+pub mod block;
+pub mod decider;
+pub mod ext;
+pub mod gradient_descent;
+pub mod hyper;
+pub mod layer;
 pub mod loss;
 pub mod not_nan;
+pub mod predictor;
+pub mod sample;
 pub mod scalar;
-pub mod tensor;
+pub mod smooth;
 pub mod traits;
@@ -3,8 +3,10 @@ use std::{
     ops::{Add, Mul, Neg},
 };

+use crate::auto_diff::{Differentiable, RankedDifferentiableTagged};
+use crate::ext::{sum, sum_1};
 use crate::{
-    auto_diff::{Differentiable, RankedDifferentiable},
+    auto_diff::{DifferentiableTagged, RankedDifferentiable},
     scalar::Scalar,
     traits::{One, Zero},
 };
@@ -23,14 +25,47 @@ pub fn elementwise_mul<A, const RANK: usize>(
 where
     A: Mul<Output = A> + Sum<<A as Mul>::Output> + Clone + Default,
 {
-    RankedDifferentiable::map2(x, y, &|x, y| x.clone() * y.clone())
+    RankedDifferentiable::map2(x, y, &mut |x, y| x.clone() * y.clone())
 }

+pub fn dot_unranked_tagged<A, Tag1, Tag2, Tag3, F>(
+    x: &DifferentiableTagged<A, Tag1>,
+    y: &DifferentiableTagged<A, Tag2>,
+    mut combine_tags: F,
+) -> DifferentiableTagged<A, Tag3>
+where
+    A: Mul<Output = A> + Sum<<A as Mul>::Output> + Clone + Default,
+    F: FnMut(Tag1, Tag2) -> Tag3,
+    Tag1: Clone,
+    Tag2: Clone,
+{
+    DifferentiableTagged::map2_tagged(x, y, &mut |x, tag1, y, tag2| {
+        (x.clone() * y.clone(), combine_tags(tag1, tag2))
+    })
+}
+
 pub fn dot_unranked<A>(x: &Differentiable<A>, y: &Differentiable<A>) -> Differentiable<A>
 where
     A: Mul<Output = A> + Sum<<A as Mul>::Output> + Clone + Default,
 {
-    Differentiable::map2(x, y, &|x, y| x.clone() * y.clone())
+    dot_unranked_tagged(x, y, |(), ()| ())
 }

+pub fn dot<A, Tag1, Tag2>(
+    x: &RankedDifferentiableTagged<A, Tag1, 1>,
+    y: &RankedDifferentiableTagged<A, Tag2, 1>,
+) -> Scalar<A>
+where
+    A: Mul<Output = A> + Sum + Clone + Add<Output = A> + Zero,
+{
+    // Much sadness - find a way to get rid of these clones
+    let x = x.map_tag(&mut |_| ());
+    let y = y.map_tag(&mut |_| ());
+    x.to_vector()
+        .iter()
+        .zip(y.to_vector().iter())
+        .map(|(x, y)| x.clone().to_scalar() * y.clone().to_scalar())
+        .sum()
+}
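// Editor's sketch, not part of this commit: a worked example of `dot` under
// the API used elsewhere in this diff; (1, 2, 3) . (4, 5, 6) = 4 + 10 + 18 = 32.
#[cfg(test)]
mod test_dot {
    use crate::auto_diff::RankedDifferentiable;
    use crate::loss::dot;
    use crate::not_nan::to_not_nan_1;

    #[test]
    fn dot_example() {
        let x = RankedDifferentiable::of_slice(&to_not_nan_1([1.0, 2.0, 3.0]));
        let y = RankedDifferentiable::of_slice(&to_not_nan_1([4.0, 5.0, 6.0]));
        assert_eq!(dot(&x, &y).clone_real_part().into_inner(), 32.0);
    }
}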

 fn squared_2<A, const RANK: usize>(
@@ -39,17 +74,7 @@ fn squared_2<A, const RANK: usize>(
 where
     A: Mul<Output = A> + Copy + Default,
 {
-    RankedDifferentiable::map2(x, x, &|x, y| x.clone() * y.clone())
-}
-
-fn sum_2<A>(x: RankedDifferentiable<A, 1>) -> Scalar<A>
-where
-    A: Sum<A> + Clone + Add<Output = A> + Zero,
-{
-    RankedDifferentiable::to_vector(x)
-        .into_iter()
-        .map(|x| x.to_scalar())
-        .sum()
+    RankedDifferentiable::map2(x, x, &mut |x, y| x.clone() * y.clone())
 }

 fn l2_norm_2<A>(
@@ -59,8 +84,8 @@ fn l2_norm_2<A>(
 where
     A: Sum<A> + Mul<Output = A> + Copy + Default + Neg<Output = A> + Add<Output = A> + Zero + Neg,
 {
-    let diff = RankedDifferentiable::map2(prediction, data, &|x, y| x.clone() - y.clone());
-    sum_2(squared_2(&diff))
+    let diff = RankedDifferentiable::map2(prediction, data, &mut |x, y| x.clone() - y.clone());
+    sum_1(squared_2(&diff)).into_scalar()
 }

 pub fn l2_loss_2<A, F, Params, const N: usize>(
@@ -126,7 +151,7 @@ where
     let dotted = RankedDifferentiable::of_scalar(
         dot_unranked(
             left_arg.to_unranked_borrow(),
-            &Differentiable::of_vec(theta.to_vec()),
+            &DifferentiableTagged::of_vec(theta.to_vec()),
         )
         .into_vector()
         .into_iter()
@@ -180,7 +205,7 @@ where
     );
     dot_unranked(
         x_powers.to_unranked_borrow(),
-        &Differentiable::of_vec(theta.to_vec()),
+        &DifferentiableTagged::of_vec(theta.to_vec()),
     )
     .attach_rank::<1>()
     .expect("wanted a tensor1")
@@ -191,7 +216,10 @@ where
     })
 }

-// The parameters are: a tensor1 of length 2 (to be dotted with the input), and a scalar (to translate).
+/// The parameters are: a tensor1 of length 2 (to be dotted with the input), and a scalar (to translate).
+///
+/// # Panics
+/// Panics if the input `theta` is not of rank 1 consisting of a tensor1 and a scalar.
 pub fn predict_plane<A>(
     xs: RankedDifferentiable<A, 2>,
     theta: &[Differentiable<A>; 2],
@@ -199,9 +227,12 @@ pub fn predict_plane<A>(
 where
     A: Mul<Output = A> + Add<Output = A> + Sum + Default + One + Zero + Clone,
 {
-    if theta[0].rank() != 1 {
-        panic!("theta0 must be of rank 1, got: {}", theta[0].rank())
-    }
+    assert_eq!(
+        theta[0].rank(),
+        1,
+        "theta0 must be of rank 1, got: {}",
+        theta[0].rank()
+    );
     let theta0 = RankedDifferentiable::of_vector(
         theta[0]
             .borrow_vector()
@@ -209,12 +240,41 @@ where
             .map(|v| RankedDifferentiable::of_scalar(v.borrow_scalar().clone()))
             .collect::<Vec<_>>(),
     );
-    let theta1 = theta[1].borrow_scalar().clone();
+    let theta1 = theta[1].clone().attach_rank::<0>().unwrap();
     let dotted: Vec<_> = xs
         .to_vector()
         .into_iter()
-        .map(|point| sum_2(elementwise_mul(&theta0, &point)))
-        .map(|x| RankedDifferentiable::of_scalar(x + theta1.clone()))
+        .map(|point| {
+            sum(elementwise_mul(&theta0, &point).to_unranked_borrow())
+                .attach_rank::<0>()
+                .unwrap()
+        })
+        .map(|x| x.map2(&theta1, &mut |x, theta| x.clone() + theta.clone()))
         .collect();
     RankedDifferentiable::of_vector(dotted)
 }

+#[cfg(test)]
+mod test_loss {
+    use crate::auto_diff::RankedDifferentiable;
+    use crate::loss::{l2_loss_2, predict_line_2};
+    use crate::scalar::Scalar;
+    use crate::traits::Zero;
+
+    #[test]
+    fn loss_example() {
+        let xs = [2.0, 1.0, 4.0, 3.0];
+        let ys = [1.8, 1.2, 4.2, 3.3];
+        let loss = l2_loss_2(
+            predict_line_2,
+            RankedDifferentiable::of_slice(&xs),
+            RankedDifferentiable::of_slice(&ys),
+            &[
+                RankedDifferentiable::of_scalar(Scalar::zero()),
+                RankedDifferentiable::of_scalar(Scalar::zero()),
+            ],
+        );
+
+        assert_eq!(*loss.real_part(), 33.21);
+    }
+}
19
little_learner/src/not_nan.rs
Normal file
@@ -0,0 +1,19 @@
use ordered_float::NotNan;

pub fn to_not_nan_1<T, const N: usize>(xs: [T; N]) -> [NotNan<T>; N]
where
    T: ordered_float::Float,
{
    xs.map(|x| NotNan::new(x).expect("not nan"))
}

pub fn from_not_nan_1<T, const N: usize>(xs: [NotNan<T>; N]) -> [T; N] {
    xs.map(|x| x.into_inner())
}

pub fn to_not_nan_2<T, const N: usize, const M: usize>(xs: [[T; N]; M]) -> [[NotNan<T>; N]; M]
where
    T: ordered_float::Float,
{
    xs.map(to_not_nan_1)
}
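// Editor's sketch, not part of this commit: `to_not_nan_1` and
// `from_not_nan_1` are inverses whenever no NaN is present.
#[cfg(test)]
mod tests {
    use super::{from_not_nan_1, to_not_nan_1};

    #[test]
    fn round_trip() {
        let lifted = to_not_nan_1([1.0, 2.0, 3.0]);
        assert_eq!(from_not_nan_1(lifted), [1.0, 2.0, 3.0]);
    }
}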
197
little_learner/src/predictor.rs
Normal file
@@ -0,0 +1,197 @@
use crate::auto_diff::{Differentiable, DifferentiableTagged};
use crate::scalar::Scalar;
use crate::smooth::smooth;
use crate::traits::NumLike;

/// A Predictor is a function (`predict`) we're optimising, an `inflate` which adds any metadata
/// that the prediction engine might require, a corresponding `deflate` which removes the metadata,
/// and an `update` which computes the next guess based on the previous guess.
pub struct Predictor<F, Inflated, Deflated, Params> {
    /// The function we're trying to optimise.
    pub predict: F,
    /// Attach prediction metadata to an input to the function we're trying to optimise.
    pub inflate: fn(Deflated) -> Inflated,
    /// Remove prediction metadata.
    pub deflate: fn(Inflated) -> Deflated,
    /// Given a guess at an optimum, the gradient at that point, and any hyperparameters,
    /// compute the next guess at the optimum.
    pub update: fn(Inflated, &Deflated, Params) -> Inflated,
}

/// Hyperparameters applying to the most basic way to calculate the next step.
#[derive(Clone)]
pub struct NakedHypers<A> {
    pub learning_rate: A,
}

pub const fn naked<F, A>(f: F) -> Predictor<F, Differentiable<A>, Differentiable<A>, NakedHypers<A>>
where
    A: NumLike,
{
    Predictor {
        predict: f,
        inflate: |x| x,
        deflate: |x| x,

        // Plain gradient descent: theta_next = theta - learning_rate * delta.
        update: |theta, delta, hyper| {
            let learning_rate = Scalar::make(hyper.learning_rate);
            Differentiable::map2(&theta, delta, &mut |theta, delta| {
                (theta.clone() - delta.clone() * learning_rate.clone()).truncate_dual(None)
            })
        },
    }
}

#[derive(Clone)]
pub struct RmsHyper<A> {
    pub stabilizer: A,
    pub beta: A,
    pub learning_rate: A,
}

impl<A> RmsHyper<A> {
    #[must_use]
    pub fn with_stabilizer(self, s: A) -> RmsHyper<A> {
        RmsHyper {
            learning_rate: self.learning_rate,
            beta: self.beta,
            stabilizer: s,
        }
    }

    #[must_use]
    pub fn with_beta(self, s: A) -> RmsHyper<A> {
        RmsHyper {
            learning_rate: self.learning_rate,
            beta: s,
            stabilizer: self.stabilizer,
        }
    }
}

pub const fn rms<F, A>(
    f: F,
) -> Predictor<F, DifferentiableTagged<A, A>, Differentiable<A>, RmsHyper<A>>
where
    A: NumLike,
{
    Predictor {
        predict: f,
        inflate: |x| x.map_tag(&mut |()| A::zero()),
        deflate: |x| x.map_tag(&mut |_| ()),
        // RMSProp-style update: keep a smoothed average r of the squared gradients in the
        // tag, and scale each step by 1 / (sqrt(r) + stabilizer).
        update: |theta, delta, hyper| {
            DifferentiableTagged::map2_tagged(&theta, delta, &mut |theta, smoothed_r, delta, ()| {
                let r = smooth(
                    Scalar::make(hyper.beta.clone()),
                    &Differentiable::of_scalar(Scalar::make(smoothed_r)),
                    &Differentiable::of_scalar(delta.clone() * delta.clone()),
                )
                .into_scalar();
                let learning_rate = hyper.learning_rate.clone()
                    / (r.clone_real_part().sqrt() + hyper.stabilizer.clone());
                (
                    Scalar::make(
                        theta.clone_real_part() + -(delta.clone_real_part() * learning_rate),
                    ),
                    r.clone_real_part(),
                )
            })
        },
    }
}

#[derive(Clone)]
pub struct VelocityHypers<A> {
    pub learning_rate: A,
    pub mu: A,
}

pub const fn velocity<F, A>(
    f: F,
) -> Predictor<F, DifferentiableTagged<A, A>, Differentiable<A>, VelocityHypers<A>>
where
    A: NumLike,
{
    Predictor {
        predict: f,
        inflate: |x| x.map_tag(&mut |()| A::zero()),
        deflate: |x| x.map_tag(&mut |_| ()),
        // Momentum: carry a velocity in the tag that decays by mu and accumulates the
        // scaled negative gradient.
        update: |theta, delta, hyper| {
            DifferentiableTagged::map2_tagged(&theta, delta, &mut |theta, velocity, delta, ()| {
                let velocity = hyper.mu.clone() * velocity
                    + -(delta.clone_real_part() * hyper.learning_rate.clone());
                (theta.clone() + Scalar::make(velocity.clone()), velocity)
            })
        },
    }
}

#[derive(Clone)]
pub struct AdamHyper<A> {
    pub rms: RmsHyper<A>,
    pub mu: A,
}

impl<A> AdamHyper<A> {
    #[must_use]
    pub fn with_stabilizer(self, s: A) -> AdamHyper<A> {
        AdamHyper {
            mu: self.mu,
            rms: self.rms.with_stabilizer(s),
        }
    }

    #[must_use]
    pub fn with_beta(self, s: A) -> AdamHyper<A> {
        AdamHyper {
            mu: self.mu,
            rms: self.rms.with_beta(s),
        }
    }

    #[must_use]
    pub fn with_mu(self, mu: A) -> AdamHyper<A> {
        AdamHyper { mu, rms: self.rms }
    }
}

type AdamInflated<A> = DifferentiableTagged<A, (A, A)>;

pub const fn adam<F, A>(f: F) -> Predictor<F, AdamInflated<A>, Differentiable<A>, AdamHyper<A>>
where
    A: NumLike,
{
    Predictor {
        predict: f,
        inflate: |x| x.map_tag(&mut |()| (A::zero(), A::zero())),
        deflate: |x| x.map_tag(&mut |_| ()),
        // Adam-style update: the smoothed squared gradient from `rms`, plus a smoothed
        // gradient (the velocity) used in place of the raw delta.
        update: |theta, delta, hyper| {
            DifferentiableTagged::map2_tagged(
                &theta,
                delta,
                &mut |theta, (smoothed_velocity, smoothed_r), delta, ()| {
                    let r = smooth(
                        Scalar::make(hyper.rms.beta.clone()),
                        &Differentiable::of_scalar(Scalar::make(smoothed_r)),
                        &Differentiable::of_scalar(delta.clone() * delta.clone()),
                    )
                    .into_scalar();
                    let learning_rate = hyper.rms.learning_rate.clone()
                        / (r.clone_real_part().sqrt() + hyper.rms.stabilizer.clone());
                    let velocity = smooth(
                        Scalar::make(hyper.mu.clone()),
                        &Differentiable::of_scalar(Scalar::make(smoothed_velocity)),
                        &Differentiable::of_scalar(delta.clone()),
                    )
                    .into_scalar();
                    (
                        Scalar::make(
                            theta.clone_real_part() + -(velocity.clone_real_part() * learning_rate),
                        ),
                        (velocity.clone_real_part(), r.clone_real_part()),
                    )
                },
            )
        },
    }
}
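// Editor's sketch, not part of this commit: exercising the `naked` update on a
// single scalar parameter; it should step theta by -learning_rate * delta.
#[cfg(test)]
mod test_naked_update {
    use super::{naked, NakedHypers};
    use crate::auto_diff::Differentiable;
    use crate::scalar::Scalar;
    use ordered_float::NotNan;

    #[test]
    fn naked_update_is_plain_descent() {
        // The predict function is irrelevant to `update`, so pass a unit placeholder.
        let predictor = naked::<(), NotNan<f64>>(());
        let theta = Differentiable::of_scalar(Scalar::make(NotNan::new(3.0).expect("not nan")));
        let delta = Differentiable::of_scalar(Scalar::make(NotNan::new(2.0).expect("not nan")));
        let hyper = NakedHypers {
            learning_rate: NotNan::new(0.5).expect("not nan"),
        };
        let updated = (predictor.update)(theta, &delta, hyper);
        // 3.0 - 0.5 * 2.0 = 2.0
        assert_eq!(
            updated.into_scalar().clone_real_part().into_inner(),
            3.0 - 0.5 * 2.0
        );
    }
}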
21
little_learner/src/sample.rs
Normal file
@@ -0,0 +1,21 @@
use rand::Rng;

/// Grab `n` random samples from `from_x` and `from_y`, collecting them into a pair of vectors.
/// The same index is drawn for both slices on each iteration, so sampled pairs stay aligned.
pub fn take_2<R: Rng, T, U, I, J>(rng: &mut R, n: usize, from_x: I, from_y: J) -> (Vec<T>, Vec<U>)
where
    T: Copy,
    U: Copy,
    I: AsRef<[T]>,
    J: AsRef<[U]>,
{
    let from_x = from_x.as_ref();
    let from_y = from_y.as_ref();
    let mut out_x = Vec::with_capacity(n);
    let mut out_y = Vec::with_capacity(n);
    for _ in 0..n {
        let sample = rng.gen_range(0..from_x.len());
        out_x.push(from_x[sample]);
        out_y.push(from_y[sample]);
    }
    (out_x, out_y)
}
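// Editor's sketch, not part of this commit: the pairing property is easy to
// check, since it holds whatever indices the generator happens to draw.
#[cfg(test)]
mod tests {
    use super::take_2;

    #[test]
    fn samples_stay_paired() {
        let mut rng = rand::thread_rng();
        let (xs, ys) = take_2(&mut rng, 4, [1, 2, 3], [10, 20, 30]);
        assert_eq!(xs.len(), 4);
        // Every y is 10 times its paired x by construction of the inputs.
        for (x, y) in xs.iter().zip(ys.iter()) {
            assert_eq!(*y, *x * 10);
        }
    }
}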
@@ -1,5 +1,6 @@
-use crate::traits::{Exp, One, Zero};
+use crate::traits::{Exp, One, Sqrt, Zero};
 use core::hash::Hash;
+use std::cmp::Ordering;
 use std::{
     collections::{hash_map::Entry, HashMap},
     fmt::Display,
@@ -14,6 +15,8 @@ pub enum LinkData<A> {
     Mul(Box<Scalar<A>>, Box<Scalar<A>>),
     Exponent(Box<Scalar<A>>),
     Log(Box<Scalar<A>>),
+    Div(Box<Scalar<A>>, Box<Scalar<A>>),
+    Sqrt(Box<Scalar<A>>),
 }

 #[derive(Clone, Hash, PartialEq, Eq, Debug)]
@@ -41,6 +44,10 @@ where
                 f.write_fmt(format_args!("exp({})", arg.as_ref()))
             }
             Link::Link(LinkData::Log(arg)) => f.write_fmt(format_args!("log({})", arg.as_ref())),
+            Link::Link(LinkData::Sqrt(arg)) => f.write_fmt(format_args!("sqrt({})", arg.as_ref())),
+            Link::Link(LinkData::Div(left, right)) => {
+                f.write_fmt(format_args!("({} / {})", left.as_ref(), right.as_ref()))
+            }
         }
     }
 }
@@ -53,9 +60,11 @@ impl<A> Link<A> {
             + AddAssign
             + Clone
             + Exp
             + Add<Output = A>
             + Mul<Output = A>
+            + Div<Output = A>
             + Neg<Output = A>
+            + Sqrt
             + Zero
             + One,
     {
@@ -96,6 +105,21 @@ impl<A> Link<A> {
                     .clone_link()
                     .invoke(&right, left.clone_real_part() * z, acc);
             }
+            LinkData::Div(left, right) => {
+                // d/dx(f / g) = f d(1/g)/dx + (df/dx) / g
+                //             = -f (dg/dx)/g^2 + (df/dx) / g
+                left.as_ref().clone_link().invoke(
+                    &left,
+                    z.clone() / right.clone_real_part(),
+                    acc,
+                );
+                right.as_ref().clone_link().invoke(
+                    &right,
+                    -left.clone_real_part() * z
+                        / (right.clone_real_part() * right.clone_real_part()),
+                    acc,
+                );
+            }
             LinkData::Log(arg) => {
                 // d/dx(log y) = 1/y dy/dx
                 arg.as_ref().clone_link().invoke(
@@ -108,6 +132,15 @@ impl<A> Link<A> {
                 // d/dx(-y) = - dy/dx
                 arg.as_ref().clone_link().invoke(&arg, -z, acc);
             }
+            LinkData::Sqrt(arg) => {
+                // d/dx(y^(1/2)) = 1/2 y^(-1/2) dy/dx
+                let two = A::one() + A::one();
+                arg.as_ref().clone_link().invoke(
+                    &arg,
+                    A::one() / (two * arg.as_ref().clone_real_part().sqrt()) * z,
+                    acc,
+                );
+            }
         }
     }
 }
@@ -144,6 +177,15 @@ where
     }
 }

+impl<A> AddAssign for Scalar<A>
+where
+    A: Add<Output = A> + Clone,
+{
+    fn add_assign(&mut self, rhs: Self) {
+        *self = self.clone() + rhs;
+    }
+}
+
 impl<A> Neg for Scalar<A>
 where
     A: Neg<Output = A> + Clone,
@@ -190,17 +232,72 @@ where
     fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
         let mut answer = Zero::zero();
         for i in iter {
-            answer = answer + i;
+            answer += i;
         }
         answer
     }
 }

+impl<A> PartialOrd for Scalar<A>
+where
+    A: PartialOrd + Clone,
+{
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        self.real_part().partial_cmp(other.real_part())
+    }
+}
+
 impl<A> Exp for Scalar<A>
 where
     A: Exp + Clone,
 {
     fn exp(self) -> Self {
         Self::Dual(
             self.clone_real_part().exp(),
             Link::Link(LinkData::Exponent(Box::new(self))),
         )
     }
 }

+impl<A> Div for Scalar<A>
+where
+    A: Div<Output = A> + Clone,
+{
+    type Output = Scalar<A>;
+
+    fn div(self, rhs: Self) -> Self::Output {
+        Self::Dual(
+            self.clone_real_part() / rhs.clone_real_part(),
+            Link::Link(LinkData::Div(Box::new(self), Box::new(rhs))),
+        )
+    }
+}
+
+impl<A> Sqrt for Scalar<A>
+where
+    A: Sqrt + Clone,
+{
+    fn sqrt(self) -> Self {
+        Self::Dual(
+            self.clone_real_part().sqrt(),
+            Link::Link(LinkData::Sqrt(Box::new(self))),
+        )
+    }
+}
+
 impl<A> Default for Scalar<A>
 where
     A: Default,
 {
     fn default() -> Self {
         Scalar::Number(A::default(), None)
     }
 }

 impl<A> Scalar<A> {
     pub fn real_part(&self) -> &A {
         match self {
-            Scalar::Number(a, _) => a,
-            Scalar::Dual(a, _) => a,
+            Scalar::Number(a, _) | Scalar::Dual(a, _) => a,
         }
     }
@@ -209,8 +306,7 @@ impl<A> Scalar<A> {
         A: Clone,
     {
         match self {
-            Scalar::Number(a, _) => (*a).clone(),
-            Scalar::Dual(a, _) => (*a).clone(),
+            Scalar::Number(a, _) | Scalar::Dual(a, _) => (*a).clone(),
         }
     }
@@ -231,6 +327,7 @@ impl<A> Scalar<A> {
         }
     }

+    #[must_use]
     pub fn truncate_dual(self, index: Option<usize>) -> Scalar<A>
     where
         A: Clone,
@@ -238,6 +335,7 @@ impl<A> Scalar<A> {
         Scalar::Dual(self.clone_real_part(), Link::EndOfLink(index))
     }

+    #[must_use]
     pub fn make(x: A) -> Scalar<A> {
         Scalar::Number(x, None)
     }
@@ -249,9 +347,60 @@ where
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            Scalar::Number(n, Some(index)) => f.write_fmt(format_args!("{}_{}", n, index)),
-            Scalar::Number(n, None) => f.write_fmt(format_args!("{}", n)),
-            Scalar::Dual(n, link) => f.write_fmt(format_args!("<{}, link: {}>", n, link)),
+            Scalar::Number(n, Some(index)) => f.write_fmt(format_args!("{n}_{index}")),
+            Scalar::Number(n, None) => f.write_fmt(format_args!("{n}")),
+            Scalar::Dual(n, link) => f.write_fmt(format_args!("<{n}, link: {link}>")),
         }
     }
 }

+#[cfg(test)]
+mod test_loss {
+    use crate::auto_diff::{grad, Differentiable, RankedDifferentiable};
+    use crate::scalar::Scalar;
+    use crate::traits::Sqrt;
+    use ordered_float::NotNan;
+    use std::collections::HashMap;
+
+    #[test]
+    fn div_gradient() {
+        let left = Scalar::make(NotNan::new(3.0).expect("not nan"));
+        let right = Scalar::make(NotNan::new(5.0).expect("not nan"));
+        let divided = left / right;
+        assert_eq!(divided.clone_real_part().into_inner(), 3.0 / 5.0);
+        let mut acc = HashMap::new();
+        divided
+            .clone_link()
+            .invoke(&divided, NotNan::new(1.0).expect("not nan"), &mut acc);
+
+        // Derivative of x/5 with respect to x is the constant 1/5
+        // Derivative of 3/x with respect to x is -3/x^2, so at the value 5 is -3/25
+        assert_eq!(acc.len(), 2);
+        for (key, value) in acc {
+            let key = key.real_part().into_inner();
+            let value = value.into_inner();
+            if key < 4.0 {
+                // This is the numerator.
+                assert_eq!(key, 3.0);
+                assert_eq!(value, 1.0 / 5.0);
+            } else {
+                // This is the denominator.
+                assert_eq!(key, 5.0);
+                assert_eq!(value, -3.0 / 25.0);
+            }
+        }
+    }
+
+    #[test]
+    fn sqrt_gradient() {
+        let nine = Differentiable::of_scalar(Scalar::make(NotNan::new(9.0).expect("not nan")));
+        let graded: [Differentiable<NotNan<f64>>; 1] = grad(
+            |x| RankedDifferentiable::of_scalar(x[0].clone().into_scalar().sqrt()),
+            &[nine],
+        );
+        let graded = graded.map(|x| x.into_scalar().clone_real_part().into_inner())[0];
+
+        // Derivative of sqrt(x) with respect to x at 9 is 1/(2 * 3) = 1/6
+        assert_eq!(graded, 1.0 / 6.0);
+    }
+}
125
little_learner/src/smooth.rs
Normal file
@@ -0,0 +1,125 @@
use crate::auto_diff::{Differentiable, DifferentiableTagged};
use crate::scalar::Scalar;
use crate::traits::One;
use std::ops::{Add, Mul, Neg};

/// Combine `old_value` and `new_value`, weighting the combination towards `old_value` by a
/// factor of `decay`; `new_value` receives the remaining weight `1 - decay`. For example, a
/// decay of 0.9 applied to an old value of 0 and a new value of 50.3 gives
/// 0.9 * 0 + 0.1 * 50.3 = 5.03.
pub fn smooth_tagged<A, F, Tag1, Tag2, Tag3>(
    decay: Scalar<A>,
    old_value: &DifferentiableTagged<A, Tag1>,
    new_value: &DifferentiableTagged<A, Tag2>,
    mut tags: F,
) -> DifferentiableTagged<A, Tag3>
where
    A: One + Clone + Mul<Output = A> + Neg<Output = A> + Add<Output = A>,
    F: FnMut(Tag1, Tag2) -> Tag3,
    Tag1: Clone,
    Tag2: Clone,
{
    DifferentiableTagged::map2_tagged(old_value, new_value, &mut |old, tag1, new, tag2| {
        (
            (old.clone() * decay.clone()) + (new.clone() * (Scalar::<A>::one() + -decay.clone())),
            tags(tag1, tag2),
        )
    })
}

/// Combine `old_value` and `new_value`, weighting the combination towards `old_value` by a
/// factor of `decay`; `new_value` receives the remaining weight `1 - decay`.
pub fn smooth<A>(
    decay: Scalar<A>,
    old_value: &Differentiable<A>,
    new_value: &Differentiable<A>,
) -> Differentiable<A>
where
    A: One + Clone + Mul<Output = A> + Neg<Output = A> + Add<Output = A>,
{
    smooth_tagged(decay, old_value, new_value, |(), ()| ())
}

#[cfg(test)]
mod test_smooth {
    use crate::auto_diff::Differentiable;
    use crate::scalar::Scalar;
    use crate::smooth::smooth;
    use crate::traits::Zero;
    use ordered_float::NotNan;

    #[test]
    fn one_dimension() {
        let decay = Scalar::make(NotNan::new(0.9).expect("not nan"));
        let smoothed = smooth(
            decay.clone(),
            &Differentiable::of_scalar(Scalar::<NotNan<f64>>::zero()),
            &Differentiable::of_scalar(Scalar::make(NotNan::new(50.3).expect("not nan"))),
        );
        assert_eq!(
            smoothed.into_scalar().real_part().into_inner(),
            5.0299999999999985
        );

        let numbers = vec![50.3, 22.7, 4.3, 2.7, 1.8, 2.2, 0.6];
        let mut output = Vec::with_capacity(numbers.len());
        let mut acc = Scalar::<NotNan<f64>>::zero();
        for number in numbers {
            let number =
                Differentiable::of_scalar(Scalar::make(NotNan::new(number).expect("not nan")));
            let next = smooth(decay.clone(), &Differentiable::of_scalar(acc), &number);
            output.push(next.clone().into_scalar().clone_real_part().into_inner());
            acc = next.into_scalar();
        }

        // Note that the original sequence from the book has been heavily affected by rounding.
        // By zero-indexed element 4, the sequence is different in the first significant digit!
        assert_eq!(
            output,
            vec![
                5.0299999999999985,
                6.796_999_999_999_998,
                6.547_299_999_999_998,
                6.162_569_999_999_998,
                5.7263129999999975,
                5.373_681_699_999_998,
                4.896_313_529_999_998
            ]
        );
    }

    fn hydrate(v: &[f64]) -> Differentiable<NotNan<f64>> {
        Differentiable::of_vec(
            v.iter()
                .cloned()
                .map(|v| Differentiable::of_scalar(Scalar::make(NotNan::new(v).expect("not nan"))))
                .collect(),
        )
    }

    #[test]
    fn more_dimension() {
        let decay = Scalar::make(NotNan::new(0.9).expect("not nan"));

        let inputs = [
            vec![1.0, 1.1, 3.0],
            vec![13.4, 18.2, 41.4],
            vec![1.1, 0.3, 67.3],
        ]
        .map(|x| hydrate(&x));

        let mut current = hydrate(&[0.8, 3.1, 2.2]);
        let mut output = Vec::with_capacity(inputs.len());
        for input in inputs {
            current = smooth(decay.clone(), &current, &input);
            output.push(current.clone().attach_rank::<1>().unwrap().collect());
        }

        assert_eq!(
            output,
            vec![
                vec![0.820_000_000_000_000_1, 2.9, 2.280_000_000_000_000_2],
                vec![2.078, 4.43, 6.191_999_999_999_999],
                vec![1.9802, 4.016_999_999_999_999_5, 12.302_799_999_999_998]
            ]
        );
    }
}
@@ -1,107 +0,0 @@
#[macro_export]
macro_rules! tensor {
    ($x:ty , $i: expr) => {[$x; $i]};
    ($x:ty , $i: expr, $($is:expr),+) => {[tensor!($x, $($is),+); $i]};
}

#[cfg(test)]
mod tests {
    #[test]
    fn test_tensor_type() {
        let _: tensor!(f64, 1, 2, 3) = [[[1.0, 3.0, 6.0], [-1.3, -30.0, -0.0]]];
    }
}

pub trait Extensible1<A> {
    fn apply<F>(&self, other: &A, op: &F) -> Self
    where
        F: Fn(&A, &A) -> A;
}

pub trait Extensible2<A> {
    fn apply<F>(&self, other: &Self, op: &F) -> Self
    where
        F: Fn(&A, &A) -> A;
}

impl<A, T, const N: usize> Extensible1<A> for [T; N]
where
    T: Extensible1<A> + Copy + Default,
{
    fn apply<F>(&self, other: &A, op: &F) -> Self
    where
        F: Fn(&A, &A) -> A,
    {
        let mut result = [Default::default(); N];
        for (i, coord) in self.iter().enumerate() {
            result[i] = T::apply(coord, other, op);
        }
        result
    }
}

impl<A, T, const N: usize> Extensible2<A> for [T; N]
where
    T: Extensible2<A> + Copy + Default,
{
    fn apply<F>(&self, other: &Self, op: &F) -> Self
    where
        F: Fn(&A, &A) -> A,
    {
        let mut result = [Default::default(); N];
        for (i, coord) in self.iter().enumerate() {
            result[i] = T::apply(coord, &other[i], op);
        }
        result
    }
}

#[macro_export]
macro_rules! extensible1 {
    ($x: ty) => {
        impl Extensible1<$x> for $x {
            fn apply<F>(&self, other: &$x, op: &F) -> Self
            where
                F: Fn(&Self, &Self) -> Self,
            {
                op(self, other)
            }
        }
    };
}

#[macro_export]
macro_rules! extensible2 {
    ($x: ty) => {
        impl Extensible2<$x> for $x {
            fn apply<F>(&self, other: &Self, op: &F) -> Self
            where
                F: Fn(&Self, &Self) -> Self,
            {
                op(self, other)
            }
        }
    };
}

extensible1!(u8);
extensible1!(f64);

extensible2!(u8);
extensible2!(f64);

pub fn extension1<T, A, F>(t1: &T, t2: &A, op: F) -> T
where
    T: Extensible1<A>,
    F: Fn(&A, &A) -> A,
{
    t1.apply::<F>(t2, &op)
}

pub fn extension2<T, A, F>(t1: &T, t2: &T, op: F) -> T
where
    T: Extensible2<A>,
    F: Fn(&A, &A) -> A,
{
    t1.apply::<F>(t2, &op)
}
@@ -1,6 +1,10 @@
 use crate::scalar::Scalar;
 use ordered_float::NotNan;
 use std::iter::Sum;
 use std::ops::{Add, AddAssign, Div, Mul, Neg};

 pub trait Exp {
+    #[must_use]
     fn exp(self) -> Self;
 }
@@ -10,11 +14,24 @@ impl Exp for NotNan<f64> {
     }
 }

+pub trait Sqrt {
+    #[must_use]
+    fn sqrt(self) -> Self;
+}
+
+impl Sqrt for NotNan<f64> {
+    fn sqrt(self) -> Self {
+        NotNan::new(f64::sqrt(self.into_inner())).expect("expected a non-NaN")
+    }
+}
+
 pub trait Zero {
+    #[must_use]
     fn zero() -> Self;
 }

 pub trait One {
+    #[must_use]
     fn one() -> Self;
 }
@@ -41,3 +58,25 @@ impl One for NotNan<f64> {
         NotNan::new(1.0).unwrap()
     }
 }

+pub trait NumLike:
+    One
+    + Zero
+    + Exp
+    + Add<Output = Self>
+    + AddAssign
+    + Neg<Output = Self>
+    + Mul<Output = Self>
+    + Div<Output = Self>
+    + Sum
+    + Sqrt
+    + Clone
+    + Sized
+    + PartialEq
+    + Eq
+{
+}
+
+impl NumLike for NotNan<f64> {}
+
+impl<A> NumLike for Scalar<A> where A: NumLike {}
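// Editor's sketch, not part of this commit: `NumLike` acts as a trait alias,
// so generic numeric code needs only one bound instead of the full list above.
//
//     fn double<A: NumLike>(x: A) -> A {
//         x.clone() + x
//     }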
@@ -9,3 +9,5 @@ edition = "2021"
 immutable-chunkmap = "1.0.5"
 ordered-float = "3.6.0"
 little_learner = { path = "../little_learner" }
+rand = "0.8.5"
+csv = "1.2.2"
151
little_learner_app/src/iris.csv
Executable file
@@ -0,0 +1,151 @@
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.0,3.2,1.2,0.2,Iris-setosa
5.5,3.5,1.3,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
4.4,3.0,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
5.0,3.5,1.3,0.3,Iris-setosa
4.5,2.3,1.3,0.3,Iris-setosa
4.4,3.2,1.3,0.2,Iris-setosa
5.0,3.5,1.6,0.6,Iris-setosa
5.1,3.8,1.9,0.4,Iris-setosa
4.8,3.0,1.4,0.3,Iris-setosa
5.1,3.8,1.6,0.2,Iris-setosa
4.6,3.2,1.4,0.2,Iris-setosa
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
5.7,2.8,4.5,1.3,Iris-versicolor
6.3,3.3,4.7,1.6,Iris-versicolor
4.9,2.4,3.3,1.0,Iris-versicolor
6.6,2.9,4.6,1.3,Iris-versicolor
5.2,2.7,3.9,1.4,Iris-versicolor
5.0,2.0,3.5,1.0,Iris-versicolor
5.9,3.0,4.2,1.5,Iris-versicolor
6.0,2.2,4.0,1.0,Iris-versicolor
6.1,2.9,4.7,1.4,Iris-versicolor
5.6,2.9,3.6,1.3,Iris-versicolor
6.7,3.1,4.4,1.4,Iris-versicolor
5.6,3.0,4.5,1.5,Iris-versicolor
5.8,2.7,4.1,1.0,Iris-versicolor
6.2,2.2,4.5,1.5,Iris-versicolor
5.6,2.5,3.9,1.1,Iris-versicolor
5.9,3.2,4.8,1.8,Iris-versicolor
6.1,2.8,4.0,1.3,Iris-versicolor
6.3,2.5,4.9,1.5,Iris-versicolor
6.1,2.8,4.7,1.2,Iris-versicolor
6.4,2.9,4.3,1.3,Iris-versicolor
6.6,3.0,4.4,1.4,Iris-versicolor
6.8,2.8,4.8,1.4,Iris-versicolor
6.7,3.0,5.0,1.7,Iris-versicolor
6.0,2.9,4.5,1.5,Iris-versicolor
5.7,2.6,3.5,1.0,Iris-versicolor
5.5,2.4,3.8,1.1,Iris-versicolor
5.5,2.4,3.7,1.0,Iris-versicolor
5.8,2.7,3.9,1.2,Iris-versicolor
6.0,2.7,5.1,1.6,Iris-versicolor
5.4,3.0,4.5,1.5,Iris-versicolor
6.0,3.4,4.5,1.6,Iris-versicolor
6.7,3.1,4.7,1.5,Iris-versicolor
6.3,2.3,4.4,1.3,Iris-versicolor
5.6,3.0,4.1,1.3,Iris-versicolor
5.5,2.5,4.0,1.3,Iris-versicolor
5.5,2.6,4.4,1.2,Iris-versicolor
6.1,3.0,4.6,1.4,Iris-versicolor
5.8,2.6,4.0,1.2,Iris-versicolor
5.0,2.3,3.3,1.0,Iris-versicolor
5.6,2.7,4.2,1.3,Iris-versicolor
5.7,3.0,4.2,1.2,Iris-versicolor
5.7,2.9,4.2,1.3,Iris-versicolor
6.2,2.9,4.3,1.3,Iris-versicolor
5.1,2.5,3.0,1.1,Iris-versicolor
5.7,2.8,4.1,1.3,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
7.1,3.0,5.9,2.1,Iris-virginica
6.3,2.9,5.6,1.8,Iris-virginica
6.5,3.0,5.8,2.2,Iris-virginica
7.6,3.0,6.6,2.1,Iris-virginica
4.9,2.5,4.5,1.7,Iris-virginica
7.3,2.9,6.3,1.8,Iris-virginica
6.7,2.5,5.8,1.8,Iris-virginica
7.2,3.6,6.1,2.5,Iris-virginica
6.5,3.2,5.1,2.0,Iris-virginica
6.4,2.7,5.3,1.9,Iris-virginica
6.8,3.0,5.5,2.1,Iris-virginica
5.7,2.5,5.0,2.0,Iris-virginica
5.8,2.8,5.1,2.4,Iris-virginica
6.4,3.2,5.3,2.3,Iris-virginica
6.5,3.0,5.5,1.8,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica
6.0,2.2,5.0,1.5,Iris-virginica
6.9,3.2,5.7,2.3,Iris-virginica
5.6,2.8,4.9,2.0,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
6.3,2.7,4.9,1.8,Iris-virginica
6.7,3.3,5.7,2.1,Iris-virginica
7.2,3.2,6.0,1.8,Iris-virginica
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.4,2.8,5.6,2.1,Iris-virginica
7.2,3.0,5.8,1.6,Iris-virginica
7.4,2.8,6.1,1.9,Iris-virginica
7.9,3.8,6.4,2.0,Iris-virginica
6.4,2.8,5.6,2.2,Iris-virginica
6.3,2.8,5.1,1.5,Iris-virginica
6.1,2.6,5.6,1.4,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
6.3,3.4,5.6,2.4,Iris-virginica
6.4,3.1,5.5,1.8,Iris-virginica
6.0,3.0,4.8,1.8,Iris-virginica
6.9,3.1,5.4,2.1,Iris-virginica
6.7,3.1,5.6,2.4,Iris-virginica
6.9,3.1,5.1,2.3,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
6.8,3.2,5.9,2.3,Iris-virginica
6.7,3.3,5.7,2.5,Iris-virginica
6.7,3.0,5.2,2.3,Iris-virginica
6.3,2.5,5.0,1.9,Iris-virginica
6.5,3.0,5.2,2.0,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica
110
little_learner_app/src/iris.rs
Normal file
@@ -0,0 +1,110 @@
use csv::ReaderBuilder;
use little_learner::auto_diff::RankedDifferentiable;
use little_learner::scalar::Scalar;
use little_learner::traits::{One, Zero};
use std::fmt::Debug;
use std::io::Cursor;
use std::str::FromStr;

const IRIS_DATA: &str = include_str!("iris.csv");

#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub enum IrisType {
    Setosa = 0,
    Versicolor = 1,
    Virginica = 2,
}

impl FromStr for IrisType {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "Iris-virginica" => Ok(IrisType::Virginica),
            "Iris-versicolor" => Ok(IrisType::Versicolor),
            "Iris-setosa" => Ok(IrisType::Setosa),
            _ => Err(String::from(s)),
        }
    }
}

#[derive(PartialEq, Debug)]
pub struct Iris<A> {
    pub class: IrisType,
    pub petal_length: A,
    pub petal_width: A,
    pub sepal_length: A,
    pub sepal_width: A,
}

pub fn import<A, B>() -> Vec<Iris<A>>
where
    A: FromStr<Err = B>,
    B: Debug,
{
    let mut reader = ReaderBuilder::new()
        .has_headers(false)
        .from_reader(Cursor::new(IRIS_DATA));
    let mut output = Vec::new();
    for record in reader.records() {
        let record = record.unwrap();
        // Fields are read positionally; note that the conventional Iris column order
        // is sepal length, sepal width, petal length, petal width.
        let petal_length = A::from_str(&record[0]).unwrap();
        let petal_width = A::from_str(&record[1]).unwrap();
        let sepal_length = A::from_str(&record[2]).unwrap();
        let sepal_width = A::from_str(&record[3]).unwrap();
        let class = IrisType::from_str(&record[4]).unwrap();
        output.push(Iris {
            class,
            petal_length,
            petal_width,
            sepal_length,
            sepal_width,
        });
    }

    output
}

impl<A> Iris<A> {
    pub fn one_hot(&self) -> (RankedDifferentiable<A, 1>, RankedDifferentiable<A, 1>)
    where
        A: Clone + Zero + One,
    {
        let vec = vec![
            RankedDifferentiable::of_scalar(Scalar::make(self.petal_length.clone())),
            RankedDifferentiable::of_scalar(Scalar::make(self.petal_width.clone())),
            RankedDifferentiable::of_scalar(Scalar::make(self.sepal_length.clone())),
            RankedDifferentiable::of_scalar(Scalar::make(self.sepal_width.clone())),
        ];

        let mut one_hot = vec![A::zero(); 3];
        one_hot[self.class as usize] = A::one();
        let one_hot = one_hot
            .iter()
            .map(|x| RankedDifferentiable::of_scalar(Scalar::make(x.clone())))
            .collect();
        (
            RankedDifferentiable::of_vector(vec),
            RankedDifferentiable::of_vector(one_hot),
        )
    }
}

#[cfg(test)]
mod test {
    use crate::iris::{import, Iris, IrisType};

    const EXPECTED_FIRST: Iris<f32> = Iris {
        class: IrisType::Setosa,
        petal_length: 5.1,
        petal_width: 3.5,
        sepal_length: 1.4,
        sepal_width: 0.2,
    };

    #[test]
    fn first_element() {
        let irises = import();
        assert_eq!(irises[0], EXPECTED_FIRST);
    }
}
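// Editor's sketch, not part of this commit: `one_hot` puts a single 1 at the
// index given by the class discriminant, so Setosa (class 0) maps to (1, 0, 0).
// This assumes `collect` flattens a rank-1 tensor to a Vec, as used in
// rms_example.rs later in this diff.
#[cfg(test)]
mod test_one_hot {
    use crate::iris::{Iris, IrisType};
    use ordered_float::NotNan;

    #[test]
    fn setosa_one_hot() {
        let iris = Iris {
            class: IrisType::Setosa,
            petal_length: NotNan::new(5.1).expect("not nan"),
            petal_width: NotNan::new(3.5).expect("not nan"),
            sepal_length: NotNan::new(1.4).expect("not nan"),
            sepal_width: NotNan::new(0.2).expect("not nan"),
        };
        let (_features, target) = iris.one_hot();
        let target: Vec<f64> = target.collect().iter().map(|x| x.into_inner()).collect();
        assert_eq!(target, vec![1.0, 0.0, 0.0]);
    }
}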
@@ -1,389 +1,27 @@
 #![allow(incomplete_features)]
 #![feature(generic_const_exprs)]

-mod with_tensor;
-
-use core::hash::Hash;
-use std::ops::{Add, AddAssign, Div, Mul, Neg};
-
-use little_learner::auto_diff::{grad, Differentiable, RankedDifferentiable};
-
-use little_learner::loss::{l2_loss_2, predict_plane};
-use little_learner::scalar::Scalar;
-use little_learner::traits::{Exp, One, Zero};
+use crate::rms_example::rms_example;
+use little_learner::auto_diff::RankedDifferentiable;
+use little_learner::block;
 use ordered_float::NotNan;

-fn iterate<A, F>(f: &F, start: A, n: u32) -> A
-where
-    F: Fn(A) -> A,
-{
-    let mut v = start;
-    for _ in 0..n {
-        v = f(v);
-    }
-    v
-}
-
-struct GradientDescentHyper<A> {
-    learning_rate: A,
-    iterations: u32,
-}
-
-fn gradient_descent_step<A, F, const RANK: usize, const PARAM_NUM: usize>(
-    f: &F,
-    theta: [Differentiable<A>; PARAM_NUM],
-    params: &GradientDescentHyper<A>,
-) -> [Differentiable<A>; PARAM_NUM]
-where
-    A: Clone
-        + Mul<Output = A>
-        + Neg<Output = A>
-        + Add<Output = A>
-        + Hash
-        + AddAssign
-        + Div<Output = A>
-        + Zero
-        + One
-        + Eq
-        + Exp,
-    F: Fn(&[Differentiable<A>; PARAM_NUM]) -> RankedDifferentiable<A, RANK>,
-{
-    let delta = grad(f, &theta);
-    let mut i = 0;
-    theta.map(|theta| {
-        let delta = &delta[i];
-        i += 1;
-        // For speed, you might want to truncate_dual this.
-        let learning_rate = Scalar::make((params.learning_rate).clone());
-        Differentiable::map2(
-            &theta,
-            &delta.map(&mut |s| s * learning_rate.clone()),
-            &|theta, delta| (*theta).clone() - (*delta).clone(),
-        )
-    })
-}
+mod iris;
+mod rms_example;

 fn main() {
-    let plane_xs = [
-        [1.0, 2.05],
-        [1.0, 3.0],
-        [2.0, 2.0],
-        [2.0, 3.91],
-        [3.0, 6.13],
-        [4.0, 8.09],
-    ];
-    let plane_ys = [13.99, 15.99, 18.0, 22.4, 30.2, 37.94];
+    rms_example();

-    let hyper = GradientDescentHyper {
-        learning_rate: NotNan::new(0.001).expect("not nan"),
-        iterations: 1000,
-    };
+    let irises = iris::import::<f64, _>();
+    let mut xs = Vec::with_capacity(irises.len());
+    let mut ys = Vec::with_capacity(irises.len());
+    for iris in irises {
+        let (x, y) = iris.one_hot();
+        xs.push(x);
+        ys.push(y);
+    }
+    let _xs = RankedDifferentiable::of_vector(xs);
+    let _ys = RankedDifferentiable::of_vector(ys);

-    let iterated = {
-        let xs = plane_xs.map(|x| {
-            [
-                NotNan::new(x[0]).expect("not nan"),
-                NotNan::new(x[1]).expect("not nan"),
-            ]
-        });
-        let ys = plane_ys.map(|x| NotNan::new(x).expect("not nan"));
-        iterate(
-            &|theta| {
-                gradient_descent_step(
-                    &|x| {
-                        RankedDifferentiable::of_vector(vec![RankedDifferentiable::of_scalar(
-                            l2_loss_2(
-                                predict_plane,
-                                RankedDifferentiable::of_slice_2::<_, 2>(&xs),
-                                RankedDifferentiable::of_slice(ys),
-                                x,
-                            ),
-                        )])
-                    },
-                    theta,
-                    &hyper,
-                )
-            },
-            [
-                RankedDifferentiable::of_slice([NotNan::zero(), NotNan::zero()]).to_unranked(),
-                Differentiable::of_scalar(Scalar::zero()),
-            ],
-            hyper.iterations,
-        )
-    };
-
-    let [theta0, theta1] = iterated;
-
-    let theta0 = theta0.attach_rank::<1>().expect("rank 1 tensor");
-    let theta1 = theta1.attach_rank::<0>().expect("rank 0 tensor");
-
-    assert_eq!(
-        theta0
-            .to_vector()
-            .into_iter()
-            .map(|x| x.to_scalar().real_part().into_inner())
-            .collect::<Vec<_>>(),
-        [3.97757644609063, 2.0496557321494446]
-    );
-    assert_eq!(
-        theta1.to_scalar().real_part().into_inner(),
-        5.786758464448078
-    );
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use little_learner::{
-        auto_diff::grad,
-        loss::{l2_loss_2, predict_line_2, predict_line_2_unranked, predict_quadratic_unranked},
-    };
-
-    use crate::with_tensor::{l2_loss, predict_line};
-
-    #[test]
-    fn loss_example() {
-        let xs = [2.0, 1.0, 4.0, 3.0];
-        let ys = [1.8, 1.2, 4.2, 3.3];
-        let loss = l2_loss_2(
-            predict_line_2,
-            RankedDifferentiable::of_slice(&xs),
-            RankedDifferentiable::of_slice(&ys),
-            &[
-                RankedDifferentiable::of_scalar(Scalar::zero()),
-                RankedDifferentiable::of_scalar(Scalar::zero()),
-            ],
-        );
-
-        assert_eq!(*loss.real_part(), 33.21);
-    }
-
-    #[test]
-    fn l2_loss_non_autodiff_example() {
-        let xs = [2.0, 1.0, 4.0, 3.0];
-        let ys = [1.8, 1.2, 4.2, 3.3];
-        let loss = l2_loss(predict_line, &xs, &ys, &[0.0099, 0.0]);
-        assert_eq!(loss, 32.5892403);
-    }
-
-    #[test]
-    fn grad_example() {
-        let input_vec = [Differentiable::of_scalar(Scalar::make(
-            NotNan::new(27.0).expect("not nan"),
-        ))];
-
-        let grad: Vec<_> = grad(
-            |x| {
-                RankedDifferentiable::of_scalar(
-                    x[0].borrow_scalar().clone() * x[0].borrow_scalar().clone(),
-                )
-            },
-            &input_vec,
-        )
-        .into_iter()
-        .map(|x| x.into_scalar().real_part().into_inner())
-        .collect();
-        assert_eq!(grad, [54.0]);
-    }
-
-    #[test]
-    fn loss_gradient() {
-        let zero = Scalar::<NotNan<f64>>::zero();
-        let input_vec = [
-            RankedDifferentiable::of_scalar(zero.clone()).to_unranked(),
-            RankedDifferentiable::of_scalar(zero).to_unranked(),
-        ];
-        let xs = [2.0, 1.0, 4.0, 3.0].map(|x| NotNan::new(x).expect("not nan"));
-        let ys = [1.8, 1.2, 4.2, 3.3].map(|x| NotNan::new(x).expect("not nan"));
-        let grad = grad(
-            |x| {
-                RankedDifferentiable::of_vector(vec![RankedDifferentiable::of_scalar(l2_loss_2(
-                    predict_line_2_unranked,
-                    RankedDifferentiable::of_slice(&xs),
-                    RankedDifferentiable::of_slice(&ys),
-                    x,
-                ))])
-            },
-            &input_vec,
-        );
-
-        assert_eq!(
-            grad.into_iter()
-                .map(|x| *(x.into_scalar().real_part()))
-                .collect::<Vec<_>>(),
-            [-63.0, -21.0]
-        );
-    }
-
-    #[test]
-    fn test_iterate() {
-        let f = |t: [i32; 3]| t.map(|i| i - 3);
-        assert_eq!(iterate(&f, [1, 2, 3], 5u32), [-14, -13, -12]);
-    }
-
-    #[test]
-    fn first_optimisation_test() {
-        let xs = [2.0, 1.0, 4.0, 3.0];
-        let ys = [1.8, 1.2, 4.2, 3.3];
-
-        let zero = Scalar::<NotNan<f64>>::zero();
-
-        let hyper = GradientDescentHyper {
-            learning_rate: NotNan::new(0.01).expect("not nan"),
-            iterations: 1000,
-        };
-        let iterated = {
-            let xs = xs.map(|x| NotNan::new(x).expect("not nan"));
-            let ys = ys.map(|x| NotNan::new(x).expect("not nan"));
-            iterate(
-                &|theta| {
-                    gradient_descent_step(
-                        &|x| {
-                            RankedDifferentiable::of_vector(vec![RankedDifferentiable::of_scalar(
-                                l2_loss_2(
-                                    predict_line_2_unranked,
-                                    RankedDifferentiable::of_slice(&xs),
-                                    RankedDifferentiable::of_slice(&ys),
-                                    x,
-                                ),
-                            )])
-                        },
-                        theta,
-                        &hyper,
-                    )
-                },
-                [
-                    RankedDifferentiable::of_scalar(zero.clone()).to_unranked(),
-                    RankedDifferentiable::of_scalar(zero).to_unranked(),
-                ],
-                hyper.iterations,
-            )
-        };
-        let iterated = iterated
-            .into_iter()
-            .map(|x| x.into_scalar().real_part().into_inner())
-            .collect::<Vec<_>>();
-
-        assert_eq!(iterated, vec![1.0499993623489503, 0.0000018747718457656533]);
-    }
-
-    #[test]
-    fn optimise_quadratic() {
-        let xs = [-1.0, 0.0, 1.0, 2.0, 3.0];
-        let ys = [2.55, 2.1, 4.35, 10.2, 18.25];
-
-        let zero = Scalar::<NotNan<f64>>::zero();
-
-        let hyper = GradientDescentHyper {
-            learning_rate: NotNan::new(0.001).expect("not nan"),
-            iterations: 1000,
-        };
-
-        let iterated = {
-            let xs = xs.map(|x| NotNan::new(x).expect("not nan"));
-            let ys = ys.map(|x| NotNan::new(x).expect("not nan"));
-            iterate(
-                &|theta| {
-                    gradient_descent_step(
-                        &|x| {
-                            RankedDifferentiable::of_vector(vec![RankedDifferentiable::of_scalar(
-                                l2_loss_2(
-                                    predict_quadratic_unranked,
-                                    RankedDifferentiable::of_slice(&xs),
-                                    RankedDifferentiable::of_slice(&ys),
-                                    x,
-                                ),
-                            )])
-                        },
-                        theta,
-                        &hyper,
-                    )
-                },
-                [
-                    RankedDifferentiable::of_scalar(zero.clone()).to_unranked(),
-                    RankedDifferentiable::of_scalar(zero.clone()).to_unranked(),
-                    RankedDifferentiable::of_scalar(zero).to_unranked(),
-                ],
-                hyper.iterations,
-            )
-        };
-        let iterated = iterated
-            .into_iter()
-            .map(|x| x.into_scalar().real_part().into_inner())
-            .collect::<Vec<_>>();
-
-        assert_eq!(
-            iterated,
-            [2.0546423148479684, 0.9928606519360353, 1.4787394427094362]
-        );
-    }
-
-    #[test]
-    fn optimise_plane() {
-        let plane_xs = [
-            [1.0, 2.05],
-            [1.0, 3.0],
-            [2.0, 2.0],
-            [2.0, 3.91],
-            [3.0, 6.13],
-            [4.0, 8.09],
-        ];
-        let plane_ys = [13.99, 15.99, 18.0, 22.4, 30.2, 37.94];
-
-        let hyper = GradientDescentHyper {
-            learning_rate: NotNan::new(0.001).expect("not nan"),
-            iterations: 1000,
-        };
-
-        let iterated = {
-            let xs = plane_xs.map(|x| {
-                [
-                    NotNan::new(x[0]).expect("not nan"),
-                    NotNan::new(x[1]).expect("not nan"),
-                ]
-            });
-            let ys = plane_ys.map(|x| NotNan::new(x).expect("not nan"));
-            iterate(
-                &|theta| {
-                    gradient_descent_step(
-                        &|x| {
-                            RankedDifferentiable::of_vector(vec![RankedDifferentiable::of_scalar(
-                                l2_loss_2(
-                                    predict_plane,
-                                    RankedDifferentiable::of_slice_2::<_, 2>(&xs),
-                                    RankedDifferentiable::of_slice(ys),
-                                    x,
-                                ),
-                            )])
-                        },
-                        theta,
-                        &hyper,
-                    )
-                },
-                [
-                    RankedDifferentiable::of_slice([NotNan::zero(), NotNan::zero()]).to_unranked(),
-                    Differentiable::of_scalar(Scalar::zero()),
-                ],
-                hyper.iterations,
-            )
-        };
-
-        let [theta0, theta1] = iterated;
-
-        let theta0 = theta0.attach_rank::<1>().expect("rank 1 tensor");
-        let theta1 = theta1.attach_rank::<0>().expect("rank 0 tensor");
-
-        assert_eq!(
-            theta0
-                .to_vector()
-                .into_iter()
-                .map(|x| x.to_scalar().real_part().into_inner())
-                .collect::<Vec<_>>(),
-            [3.97757644609063, 2.0496557321494446]
-        );
-        assert_eq!(
-            theta1.to_scalar().real_part().into_inner(),
-            5.786758464448078
-        );
-    }
-}
+    let _network = block::compose(block::dense::<NotNan<f64>, ()>(6, 3), block::dense(4, 6), 2);
 }
65
little_learner_app/src/rms_example.rs
Normal file
@@ -0,0 +1,65 @@
use little_learner::auto_diff::{Differentiable, RankedDifferentiable, RankedDifferentiableTagged};

use little_learner::gradient_descent::gradient_descent;
use little_learner::hyper;
use little_learner::loss::predict_plane;
use little_learner::not_nan::{to_not_nan_1, to_not_nan_2};
use little_learner::predictor;
use little_learner::scalar::Scalar;
use little_learner::traits::Zero;
use ordered_float::NotNan;

const PLANE_XS: [[f64; 2]; 6] = [
    [1.0, 2.05],
    [1.0, 3.0],
    [2.0, 2.0],
    [2.0, 3.91],
    [3.0, 6.13],
    [4.0, 8.09],
];
const PLANE_YS: [f64; 6] = [13.99, 15.99, 18.0, 22.4, 30.2, 37.94];

pub(crate) fn rms_example() {
    let beta = NotNan::new(0.9).expect("not nan");
    let stabilizer = NotNan::new(0.000_000_01).expect("not nan");
    let hyper = hyper::RmsGradientDescent::default(NotNan::new(0.01).expect("not nan"), 3000)
        .with_stabilizer(stabilizer)
        .with_beta(beta);

    let iterated = {
        let xs = to_not_nan_2(PLANE_XS);
        let ys = to_not_nan_1(PLANE_YS);
        let zero_params = [
            RankedDifferentiable::of_slice(&[NotNan::<f64>::zero(), NotNan::<f64>::zero()])
                .to_unranked(),
            Differentiable::of_scalar(Scalar::zero()),
        ];

        gradient_descent(
            hyper,
            &xs,
            RankedDifferentiableTagged::of_slice_2::<_, 2>,
            &ys,
            zero_params,
            predictor::rms(predict_plane),
            hyper::RmsGradientDescent::to_immutable,
        )
    };

    let [theta0, theta1] = iterated;

    let theta0 = theta0.attach_rank::<1>().expect("rank 1 tensor");
    let theta1 = theta1.attach_rank::<0>().expect("rank 0 tensor");

    let fitted_theta0 = theta0
        .collect()
        .iter()
        .map(|x| x.into_inner())
        .collect::<Vec<_>>();
    let fitted_theta1 = theta1.to_scalar().real_part().into_inner();
    assert_eq!(
        fitted_theta0,
        [3.974_645_444_172_085, 1.971_454_922_077_495]
    );
    assert_eq!(fitted_theta1, 6.164_579_048_274_036);
}
@@ -1,128 +0,0 @@
#![allow(dead_code)]

use std::iter::Sum;
use std::ops::{Mul, Sub};

use little_learner::tensor;
use little_learner::tensor::{extension2, Extensible2};
use little_learner::traits::One;

type Point<A, const N: usize> = [A; N];

type Parameters<A, const N: usize, const M: usize> = [Point<A, N>; M];

fn dot_points<A: Mul, const N: usize>(x: &Point<A, N>, y: &Point<A, N>) -> A
where
    A: Sum<<A as Mul>::Output> + Copy + Default + Mul<Output = A> + Extensible2<A>,
{
    extension2(x, y, |&x, &y| x * y).into_iter().sum()
}

fn dot<A, const N: usize, const M: usize>(x: &Point<A, N>, y: &Parameters<A, N, M>) -> Point<A, M>
where
    A: Mul<Output = A> + Sum<<A as Mul>::Output> + Copy + Default + Extensible2<A>,
{
    let mut result = [Default::default(); M];
    for (i, coord) in y.iter().map(|y| dot_points(x, y)).enumerate() {
        result[i] = coord;
    }
    result
}

fn sum<A, const N: usize>(x: &tensor!(A, N)) -> A
where
    A: Sum<A> + Copy,
{
    A::sum(x.iter().cloned())
}

fn squared<A, const N: usize>(x: &tensor!(A, N)) -> tensor!(A, N)
where
    A: Mul<Output = A> + Extensible2<A> + Copy + Default,
{
    extension2(x, x, |&a, &b| (a * b))
}

fn l2_norm<A, const N: usize>(prediction: &tensor!(A, N), data: &tensor!(A, N)) -> A
where
    A: Sum<A> + Mul<Output = A> + Extensible2<A> + Copy + Default + Sub<Output = A>,
{
    let diff = extension2(prediction, data, |&x, &y| x - y);
    sum(&squared(&diff))
}

pub fn l2_loss<A, F, Params, const N: usize>(
    target: F,
    data_xs: &tensor!(A, N),
    data_ys: &tensor!(A, N),
    params: &Params,
) -> A
where
    F: Fn(&tensor!(A, N), &Params) -> tensor!(A, N),
    A: Sum<A> + Mul<Output = A> + Extensible2<A> + Copy + Default + Sub<Output = A>,
{
    let pred_ys = target(data_xs, params);
    l2_norm(&pred_ys, data_ys)
}

pub fn predict_line<A, const N: usize>(xs: &tensor!(A, N), theta: &tensor!(A, 2)) -> tensor!(A, N)
where
    A: Mul<Output = A> + Sum<<A as Mul>::Output> + Copy + Default + Extensible2<A> + One,
{
    let mut result: tensor!(A, N) = [Default::default(); N];
    for (i, &x) in xs.iter().enumerate() {
        result[i] = dot(&[x, One::one()], &[*theta])[0];
    }
    result
}

#[cfg(test)]
mod tests {
    use super::*;
    use little_learner::tensor::extension1;

    #[test]
    fn test_extension() {
        let x: tensor!(u8, 1) = [2];
        assert_eq!(extension1(&x, &7, |x, y| x + y), [9]);
        let y: tensor!(u8, 1) = [7];
        assert_eq!(extension2(&x, &y, |x, y| x + y), [9]);

        let x: tensor!(u8, 3) = [5, 6, 7];
        assert_eq!(extension1(&x, &2, |x, y| x + y), [7, 8, 9]);
        let y: tensor!(u8, 3) = [2, 0, 1];
        assert_eq!(extension2(&x, &y, |x, y| x + y), [7, 6, 8]);

        let x: tensor!(u8, 2, 3) = [[4, 6, 7], [2, 0, 1]];
        assert_eq!(extension1(&x, &2, |x, y| x + y), [[6, 8, 9], [4, 2, 3]]);
        let y: tensor!(u8, 2, 3) = [[1, 2, 2], [6, 3, 1]];
        assert_eq!(extension2(&x, &y, |x, y| x + y), [[5, 8, 9], [8, 3, 2]]);
    }

    #[test]
    fn test_l2_norm() {
        assert_eq!(
            l2_norm(&[4.0, -3.0, 0.0, -4.0, 3.0], &[0.0, 0.0, 0.0, 0.0, 0.0]),
            50.0
        )
    }

    #[test]
    fn test_l2_loss() {
        let loss = l2_loss(
            predict_line,
            &[2.0, 1.0, 4.0, 3.0],
            &[1.8, 1.2, 4.2, 3.3],
            &[0.0, 0.0],
        );
        assert_eq!(loss, 33.21);

        let loss = l2_loss(
            predict_line,
            &[2.0, 1.0, 4.0, 3.0],
            &[1.8, 1.2, 4.2, 3.3],
            &[0.0099, 0.0],
        );
        assert_eq!((100.0 * loss).round() / 100.0, 32.59);
    }
}