diff --git a/CPU.jl b/CPU.jl
new file mode 100644
index 0000000..332cf19
--- /dev/null
+++ b/CPU.jl
@@ -0,0 +1,72 @@
+include("common.jl")
+using TensorOperations, KrylovKit, LinearAlgebra
+
+"A Hamiltonian that can be applied to a vector"
+struct HOperator{T}
+    d::Int
+    n::Int
+    N::Int
+    L::T
+    μ::T
+    ∂1::Matrix{Complex{T}}
+    Vs::Array{T}
+    function HOperator{T}(V_twobody::Function, d::Int, n::Int, N::Int, L::T, μ::T, n_image::Int) where {T<:Float}
+        k = -N÷2:N÷2-1
+        ∂1 = ∂_1DOF.(L, N, k, k')
+        Vs = calculate_Vs(V_twobody, d, n, N, L, n_image)
+        return new{T}(d, n, N, L, μ, ∂1, Vs)
+    end
+end
+
+Base.size(H::HOperator, i::Int)::Int = (i == 1 || i == 2) ? H.N^(H.d * (H.n - 1)) : throw(ArgumentError("HOperator only has 2 dimesions"))
+Base.size(H::HOperator)::Dims{2} = (size(H, 1), size(H, 2))
+
+"Dimensions of a vector to which H can be applied"
+vectorDims(H::HOperator)::Dims = tuple(fill(H.N, H.d * (H.n - 1))...)
+
+"Apply H on v and store the result in out"
+function LinearAlgebra.mul!(out::Array{Complex{T}}, H::HOperator{T}, v::Array{Complex{T}})::Array{Complex{T}} where {T<:Float}
+    #LinearMaps.check_dim_mul(out,H,v) --- dimensions don't match
+    # apply V operator
+    @. out = H.Vs * v
+    # apply K opereator
+    coeff = -1 / (2 * H.μ)
+    coords = H.n - 1
+    nconList_v_template = -collect(1:H.d*(coords))
+    for dim = 1:H.d
+        for coord1 = 1:coords
+            for coord2 = 1:coord1
+                i1 = which_index(H.n, dim, coord1)
+                i2 = which_index(H.n, dim, coord2)
+                nconList_1 = [-i1, 1]
+                nconList_2 = [-i2, 2]
+                nconList_v = copy(nconList_v_template)
+                if i1 == i2
+                    nconList_2[1] = 1
+                else
+                    nconList_v[i1] = 1
+                end
+                nconList_v[i2] = 2
+                v_new = @ncon((H.∂1, H.∂1, v), (nconList_1, nconList_2, nconList_v))
+                out = axpy!(coeff, v_new, out)
+            end
+        end
+    end
+    return out
+end
+
+"Apply H on v and return the result"
+function (H::HOperator{T})(v::Array{Complex{T}})::Array{Complex{T}} where {T<:Float}
+    out = similar(v)
+    return mul!(out, H, v)
+end
+
+tolerance = 1e-6
+
+"Wrapper for KrylovKit.eigsolve"
+function eig(H::HOperator{T}, levels::Int)::Tuple{Vector{T},Any,Any} where {T<:Float}
+    x₀ = rand(Complex{T}, vectorDims(H))
+    evals, evecs, info = eigsolve(H, x₀, levels, :SR; ishermitian = true, tol = tolerance)
+    info.converged < levels && throw(error("Not enough convergence"))
+    return real.(evals), evecs, info
+end
diff --git a/GPU.jl b/GPU.jl
new file mode 100644
index 0000000..8099f18
--- /dev/null
+++ b/GPU.jl
@@ -0,0 +1,106 @@
+include("common.jl")
+using KrylovKit, LinearAlgebra, CUDA, CUDA.CUTENSOR
+
+@assert CUDA.functional() && CUDA.has_cuda() && CUDA.has_cuda_gpu() "CUDA not available"
+
+"A Hamiltonian that can be applied to a vector"
+struct HOperator{T}
+    d::Int
+    n::Int
+    N::Int
+    K_diag::CuTensor{Complex{T}}
+    K_mixed::CuTensor{Complex{T}}
+    Vs::CuArray{T}
+    function HOperator{T}(V_twobody::Function, d::Int, n::Int, N::Int, L::T, μ::T, n_image::Int) where {T<:Float}
+        k = -N÷2:N÷2-1
+        K_partial = (im / sqrt(2 * μ)) .* ∂_1DOF.(L, N, k, k')
+        K_diag = CuTensor(CuArray(K_partial * K_partial), ['a', 'A'])
+        K_mixed = CuTensor(CuArray(K_partial), ['a', 'A']) * CuTensor(CuArray(K_partial), ['b', 'B'])
+        Vs = calculate_Vs(V_twobody, d, n, N, L, n_image)
+        return new{T}(d, n, N, K_diag, K_mixed, Vs)
+    end
+end
+
+Base.size(H::HOperator, i::Int)::Int = (i == 1 || i == 2) ? H.N^(H.d * (H.n - 1)) : throw(ArgumentError("HOperator only has 2 dimesions"))
+Base.size(H::HOperator)::Dims{2} = (size(H, 1), size(H, 2))
+
+"Dimensions of a vector to which H can be applied"
+vectorDims(H::HOperator)::Dims = tuple(fill(H.N, H.d * (H.n - 1))...)
+
+"cuTENSOR contraction and accumulation (C = A * B + C)"
+function contract_accumulate!(C::CuTensor, A::CuTensor, B::CuTensor)::CuTensor
+    # https://docs.nvidia.com/cuda/cutensor/api/cutensor.html#cutensorcontraction
+    compute_type = if eltype(C) == ComplexF32
+        CUTENSOR.CUTENSOR_COMPUTE_TF32
+    elseif eltype(C) == ComplexF64
+        CUTENSOR.CUTENSOR_COMPUTE_64F
+    else 
+        eltype(C)
+    end
+    CUTENSOR.contraction!(one(eltype(C)), A.data, A.inds, CUTENSOR.CUTENSOR_OP_IDENTITY, B.data, B.inds, CUTENSOR.CUTENSOR_OP_IDENTITY,
+                          one(eltype(C)), C.data, C.inds, CUTENSOR.CUTENSOR_OP_IDENTITY, CUTENSOR.CUTENSOR_OP_IDENTITY,
+                          compute_type=compute_type)
+    return C
+end
+
+"Apply H on v and store the result in out"
+function LinearAlgebra.mul!(out::CuArray{Complex{T}}, H::HOperator{T}, v::CuArray{Complex{T}})::CuArray{Complex{T}} where {T<:Float}
+    #LinearMaps.check_dim_mul(out,H,v) --- dimensions don't match
+    ctx = context()
+    # apply V operator
+    NVTX.@range "V" @. out = H.Vs * v
+    synchronize(ctx)
+    # apply K opereator
+    coords = H.n - 1
+    inds_template = ('a' - 1) .+ collect(1:H.d*(coords))
+    v_t = CuTensor(v, copy(inds_template))
+    out_t = CuTensor(out, copy(inds_template))
+    for dim = 1:H.d
+        for coord1 = 1:coords
+            for coord2 = 1:coord1
+                i1 = which_index(H.n, dim, coord1)
+                i2 = which_index(H.n, dim, coord2)
+                @assert v_t.inds == inds_template "v indices permuted"
+                if i1 == i2
+                    @assert H.K_diag.inds[2] == 'A' "K_diag indices permuted"
+                    H.K_diag.inds[1] = 'a' - 1 + i1
+                    v_t.inds[i1] = 'A'
+                    #synchronize(ctx)
+                    NVTX.@range "K-diag" out_t = contract_accumulate!(out_t, H.K_diag, v_t)
+                    v_t.inds[i1] = 'a' - 1 + i1
+                else
+                    @assert H.K_mixed.inds[2] == 'A' && H.K_mixed.inds[4] == 'B' "K_mixed indices permuted"
+                    H.K_mixed.inds[1] = 'a' - 1 + i1
+                    H.K_mixed.inds[3] = 'a' - 1 + i2
+                    # OPTIMIZE: A and B can be swapped
+                    v_t.inds[i1] = 'A'
+                    v_t.inds[i2] = 'B'
+                    #synchronize(ctx)
+                    NVTX.@range "K-mixed" out_t = contract_accumulate!(out_t, H.K_mixed, v_t)
+                    v_t.inds[i1] = 'a' - 1 + i1
+                    v_t.inds[i2] = 'a' - 1 + i2
+                end
+            end
+        end
+    end
+    @assert out_t.inds == inds_template "out indices permuted"
+    synchronize(ctx)
+    return out_t.data
+end
+
+"Apply H on v and return the result"
+function (H::HOperator{T})(v::CuArray{Complex{T}})::CuArray{Complex{T}} where {T<:Float}
+    out = similar(v)
+    return mul!(out, H, v)
+end
+
+tolerance = 1e-6
+
+"Wrapper for KrylovKit.eigsolve"
+function eig(H::HOperator{T}, levels::Int)::Tuple{Vector{T},Any,Any} where {T<:Float}
+    x₀ = CUDA.rand(Complex{T}, vectorDims(H)...) # ... added
+    synchronize()
+    evals, evecs, info = eigsolve(H, x₀, levels, :SR; ishermitian = true, tol = tolerance)
+    info.converged < levels && throw(error("Not enough convergence"))
+    return real.(evals), evecs, info
+end
diff --git a/benchmark.jl b/benchmark.jl
new file mode 100644
index 0000000..12c1b9f
--- /dev/null
+++ b/benchmark.jl
@@ -0,0 +1,44 @@
+using CUDA
+
+GPU_mode = !("CPU" in ARGS) && CUDA.functional() && CUDA.has_cuda() && CUDA.has_cuda_gpu()
+
+println("Running with ",Threads.nthreads()," thread(s)")
+
+if GPU_mode
+    include("GPU.jl")
+    println("Available GPUs:")
+    print("    ")
+    println.(name.(devices()))
+else
+    include("CPU.jl")
+end
+
+T=Float32
+
+function V_test(r2::T)::T
+    return -4*exp(-r2/4)
+end
+
+function apply1000times(H,v)
+    for i in 1:1000
+        v=H(v);
+    end
+end
+
+N=10
+n_image=1
+μ=0.5
+
+for L::T in 5.0:14.0
+    println("Constructing H operator...")
+    @time H=HOperator{T}(V_test,3,3,N,L,convert(T,μ),n_image)
+    println("Applying H 1000 times...")
+    if GPU_mode
+        v=CUDA.rand(Complex{T},vectorDims(H)...)
+        synchronize()
+        CUDA.@profile CUDA.@time apply1000times(H,v)
+    else
+        v=rand(Complex{T},vectorDims(H)...)
+        @time apply1000times(H,v)
+    end
+end
\ No newline at end of file
diff --git a/common.jl b/common.jl
new file mode 100644
index 0000000..d187f99
--- /dev/null
+++ b/common.jl
@@ -0,0 +1,47 @@
+Float = Union{Float32,Float64}
+
+"Eq (46): Partial derivative matrix element for 1 degree of freedom"
+function ∂_1DOF(L::T, N::Int, k::Int, l::Int)::Complex{T} where {T<:Float}
+    if k == l
+        return -im * (π / L)
+    else
+        return (π / L) * (-1)^(k - l) * exp(-im * π * (k - l) / N) / sin(π * (k - l) / N)
+    end
+end
+
+"Which index (dimension of the multidimensional array) corresponds to this dimension and coordinate?"
+which_index(n::Int, dim::Int, coord::Int)::Int = (dim - 1) * (n - 1) + coord
+
+"k value of the given degree of freedom at the corresponding index, with coord=0 always returning 0"
+get_k(n::Int, N::Int, i::CartesianIndex, dim::Int, coord::Int)::Int =
+    coord == 0 ? 0 : i[which_index(n, dim, coord)] - N ÷ 2 - 1
+
+"k value of the DOF at the specified cubic image"
+get_shifted_k(n::Int, N::Int, i::CartesianIndex, dim::Int, coord::Int, image::Vector{Int})::Int =
+    get_k(n, N, i, dim, coord) + N * image[dim]
+
+"Difference of k values between two particles"
+get_Δk(n::Int, N::Int, i::CartesianIndex, dim::Int, coord1::Int, coord2::Int, image::Vector{Int})::Int =
+    get_k(n, N, i, dim, coord1) - get_shifted_k(n, N, i, dim, coord2, image)
+
+"Calculate diagonal elements of the V matrix"
+function calculate_Vs(V_twobody::Function, d::Int, n::Int, N::Int, L::T, n_image::Int)::Array{T} where {T<:Float}
+    L²_over_N² = (L / N)^2
+    coords = n - 1
+    images = collect.(Iterators.product(fill(-n_image:n_image, d)...)) # TODO: Learn how to use tuples instead of vectors
+    Vs = zeros(T, fill(N, d * coords)...)
+    for image in images
+        Threads.@threads for i in CartesianIndices(Vs)
+            for coord1 in 1:coords
+                for coord2 in 0:coord1-1
+                    Δk² = 0
+                    for dim in 1:d
+                        Δk² += get_Δk(n, N, i, dim, coord1, coord2, image)^2
+                    end
+                    Vs[i] += V_twobody(Δk² * L²_over_N²)
+                end
+            end
+        end
+    end
+    return Vs
+end
diff --git a/tester.ipynb b/tester.ipynb
new file mode 100644
index 0000000..72186f4
--- /dev/null
+++ b/tester.ipynb
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  2.748577 seconds (5.37 M allocations: 1.547 GiB, 2.72% gc time, 51.73% compilation time)\n",
+      "318 operations : Float32[-6.121623, -4.333925, -4.3339176, -4.3331146, -4.333114, -4.2186065, -3.8715818, -3.87158, -3.5070298, -3.5068974]\n",
+      "  0.831048 seconds (534.99 k allocations: 876.908 MiB, 3.50% gc time)\n",
+      "210 operations : Float32[-5.682354, -3.9181006, -3.9180284, -3.9128082, -3.9127624, -3.2553062]\n",
+      "  0.951621 seconds (619.15 k allocations: 1021.256 MiB, 3.53% gc time)\n",
+      "243 operations : Float32[-5.6661367, -3.8267198, -3.826183, -3.8261826, -3.8142636, -3.813859, -3.813859, -2.9851913, -2.6131256]\n",
+      "  0.521213 seconds (351.49 k allocations: 572.464 MiB, 3.02% gc time)\n",
+      "138 operations : Float32[-5.7960987, -3.7636561, -3.7615638, -3.7473733, -3.7458477, -3.114099]\n",
+      "  0.991049 seconds (499.35 k allocations: 820.918 MiB, 19.23% gc time)\n",
+      "196 operations : Float32[-6.098429, -3.7029655, -3.6973379, -3.6973376, -3.6882992, -3.6845326, -3.6845293, -3.328601]\n",
+      "  0.760867 seconds (496.83 k allocations: 821.141 MiB, 3.50% gc time)\n",
+      "195 operations : Float32[-6.557852, -3.6505358, -3.639437, -3.6385512, -3.6385508, -3.6325362, -3.6325336, -3.464574, -3.0474584, -3.0474575, -2.8103795, -2.5698478, -2.566689]\n",
+      "  0.475876 seconds (320.90 k allocations: 521.722 MiB, 3.28% gc time)\n",
+      "126 operations : Float32[-7.090806, -3.6026099, -3.59206, -3.5818431, -3.5804336, -3.507454]\n",
+      "  0.726440 seconds (435.62 k allocations: 713.962 MiB, 3.45% gc time)\n",
+      "171 operations : Float32[-7.620871, -3.5517163, -3.5371916, -3.5240357, -3.524035]\n",
+      "  0.429791 seconds (290.32 k allocations: 471.693 MiB, 3.72% gc time)\n",
+      "114 operations : Float32[-8.108637, -3.4919598, -3.473435, -3.4578817, -3.443922, -3.4325478, -3.191059]\n",
+      "  0.702898 seconds (458.64 k allocations: 758.966 MiB, 3.41% gc time)\n",
+      "180 operations : Float32[-8.540969, -3.4251013, -3.4054415, -3.3879645, -3.3879626, -3.3685954, -3.3685944, -3.3663127, -3.167204, -3.1672013, -3.0144126]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ./En.run -d 3 -n 3 -e 5 -c eps=0 -c pot=v_gauss,v0=-4,r=2 -N 6 -L 5:14 -c n_imag=1\n",
+    "\n",
+    "include(\"CPU.jl\")\n",
+    "\n",
+    "T=Float32\n",
+    "\n",
+    "function V_test(r2::T)::T\n",
+    "    return -4*exp(-r2/4)\n",
+    "end\n",
+    "\n",
+    "N=6\n",
+    "for L::T in 5.0:14.0\n",
+    "    H=HOperator{T}(V_test,3,3,N,L,0.5f0,1)\n",
+    "    @time evals,evecs,info=eig(H,5)\n",
+    "    print(info.numops,\" operations : \")\n",
+    "    println(evals)\n",
+    "end"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " 14.448286 seconds (50.73 M allocations: 3.915 GiB, 8.09% gc time, 88.73% compilation time)\n",
+      "537 operations : [-1.6584965779926615, 0.09641386689178666, 0.09723973976478092, 0.5328665170436317, 1.8025648869642366]\n",
+      "  1.604373 seconds (401.60 k allocations: 1.508 GiB, 3.30% gc time)\n",
+      "553 operations : [-1.5449615189600243, -0.09575302046477274, -0.03260724659630898, 0.4296397924712101, 1.2210175901367406]\n",
+      "  2.218717 seconds (511.34 k allocations: 1.925 GiB, 10.54% gc time)\n",
+      "704 operations : [-1.5104831395040967, -0.18248248011755844, -0.1824824801175365, -0.0513106144023292, 0.38929600143076387]\n",
+      "  1.405123 seconds (341.32 k allocations: 1.281 GiB, 3.24% gc time)\n",
+      "470 operations : [-1.5012367056346252, -0.22279714610399948, -0.04336905464703111, 0.3604715853445004, 0.6888462772294086]\n",
+      "  1.210149 seconds (316.61 k allocations: 1.186 GiB, 2.94% gc time)\n",
+      "436 operations : [-1.4988400995145905, -0.24196385384291874, -0.032981565737151275, 0.33103969539525147, 0.5259297115623096]\n",
+      "  1.332379 seconds (315.88 k allocations: 1.185 GiB, 9.01% gc time)\n",
+      "435 operations : [-1.4982182159585573, -0.2512983034534455, -0.02487547395190237, 0.29998450185680947, 0.4122309417963048]\n",
+      "  1.195907 seconds (314.47 k allocations: 1.182 GiB, 3.37% gc time)\n",
+      "433 operations : [-1.4980550726371076, -0.25596036323828786, -0.019096799768268946, 0.26880364248084865, 0.3307745735815646]\n",
+      "  1.715641 seconds (424.90 k allocations: 1.603 GiB, 2.77% gc time)\n",
+      "585 operations : [-1.49801176610691, -0.25834484110826805, -0.2583448411082359, -0.014990294007072302, 0.23909670164137475, 0.2709248751032005, 0.3173198198351823, 0.4388941435702842]\n",
+      "  1.627872 seconds (391.47 k allocations: 1.476 GiB, 7.94% gc time)\n",
+      "539 operations : [-1.4980001524634678, -0.2595900399451015, -0.2595900399450435, -0.012010461572629527, 0.21193520997052917, 0.22589198387830156, 0.2668519716084061, 0.39403726730136146]\n",
+      "  1.503319 seconds (383.48 k allocations: 1.443 GiB, 3.06% gc time)\n",
+      "528 operations : [-1.4979970118272345, -0.2602517135112987, -0.26025171351127213, -0.009793271721286467, 0.1877986939216265, 0.19126207990343963, 0.22720822569865662]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ./En.run -d 3 -n 2 -e 5 -c eps=0 -c pot=v_gauss,v0=-4,r=2 -N 32 -L 5:14 -c n_imag=0\n",
+    "\n",
+    "include(\"CPU.jl\")\n",
+    "\n",
+    "T=Float64\n",
+    "\n",
+    "function V_test(r2::T)::T\n",
+    "    return -4*exp(-r2/4)\n",
+    "end\n",
+    "\n",
+    "N=32\n",
+    "for L::T in 5.0:14.0\n",
+    "    H=HOperator{T}(V_test,3,2,N,L)\n",
+    "    @time evals,evecs,info=eig(H,5)\n",
+    "    print(info.numops,\" operations : \")\n",
+    "    println(evals)\n",
+    "end"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Julia 1.7.1",
+   "language": "julia",
+   "name": "julia-1.7"
+  },
+  "language_info": {
+   "file_extension": ".jl",
+   "mimetype": "application/julia",
+   "name": "julia",
+   "version": "1.7.1"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tester.jl b/tester.jl
new file mode 100644
index 0000000..041d2cf
--- /dev/null
+++ b/tester.jl
@@ -0,0 +1,41 @@
+using CUDA
+
+GPU_mode = !("CPU" in ARGS) && CUDA.functional() && CUDA.has_cuda() && CUDA.has_cuda_gpu()
+
+println("Running with ",Threads.nthreads()," thread(s)")
+
+if GPU_mode
+    include("GPU.jl")
+    println("Available GPUs:")
+    print("    ")
+    println.(name.(devices()))
+else
+    include("CPU.jl")
+end
+
+T=Float32
+
+function V_zero(r2::T)::T
+    return 0.0
+end
+
+function V_test(r2::T)::T
+    return -4*exp(-r2/4)
+end
+
+N=6
+n_image=1
+μ=0.5
+
+levels=5
+
+for L::T in 5.0:14.0
+    H=HOperator{T}(V_test,3,3,N,L,convert(T,μ),n_image)
+    if GPU_mode
+        CUDA.@time evals,evecs,info=eig(H,levels)
+    else
+        @time evals,evecs,info=eig(H,levels)
+    end
+    print(info.numops," operations : ")
+    println(evals)
+end
\ No newline at end of file