include("common.jl")
using KrylovKit, LinearAlgebra, CUDA, CUDA.CUTENSOR

@assert CUDA.functional() && CUDA.has_cuda() && CUDA.has_cuda_gpu() "CUDA not available"

"A Hamiltonian that can be applied to a vector"
struct HOperator{T}
    d::Int
    n::Int
    N::Int
    K_diag::CuTensor{Complex{T}}
    K_mixed::CuTensor{Complex{T}}
    Vs::CuArray{T}
    function HOperator{T}(V_twobody::Function, d::Int, n::Int, N::Int, L::T, μ::T, n_image::Int) where {T<:Float}
        k = -N÷2:N÷2-1
        K_partial = (im / sqrt(2 * μ)) .* ∂_1DOF.(L, N, k, k')
        K_diag = CuTensor(CuArray(K_partial * K_partial), ['a', 'A'])
        K_mixed = CuTensor(CuArray(K_partial), ['a', 'A']) * CuTensor(CuArray(K_partial), ['b', 'B'])
        Vs = calculate_Vs(V_twobody, d, n, N, L, n_image)
        return new{T}(d, n, N, K_diag, K_mixed, Vs)
    end
end

Base.size(H::HOperator, i::Int)::Int = (i == 1 || i == 2) ? H.N^(H.d * (H.n - 1)) : throw(ArgumentError("HOperator only has 2 dimesions"))
Base.size(H::HOperator)::Dims{2} = (size(H, 1), size(H, 2))

"Dimensions of a vector to which H can be applied"
vectorDims(H::HOperator)::Dims = tuple(fill(H.N, H.d * (H.n - 1))...)

"cuTENSOR contraction and accumulation (C = A * B + C)"
function contract_accumulate!(C::CuTensor, A::CuTensor, B::CuTensor)::CuTensor
    # https://docs.nvidia.com/cuda/cutensor/api/cutensor.html#cutensorcontraction
    compute_type = if eltype(C) == ComplexF32
        CUTENSOR.CUTENSOR_COMPUTE_TF32
    elseif eltype(C) == ComplexF64
        CUTENSOR.CUTENSOR_COMPUTE_64F
    else 
        eltype(C)
    end
    CUTENSOR.contraction!(one(eltype(C)), A.data, A.inds, CUTENSOR.CUTENSOR_OP_IDENTITY, B.data, B.inds, CUTENSOR.CUTENSOR_OP_IDENTITY,
                          one(eltype(C)), C.data, C.inds, CUTENSOR.CUTENSOR_OP_IDENTITY, CUTENSOR.CUTENSOR_OP_IDENTITY,
                          compute_type=compute_type)
    return C
end

"Apply H on v and store the result in out"
function LinearAlgebra.mul!(out::CuArray{Complex{T}}, H::HOperator{T}, v::CuArray{Complex{T}})::CuArray{Complex{T}} where {T<:Float}
    #LinearMaps.check_dim_mul(out,H,v) --- dimensions don't match
    ctx = context()
    # apply V operator
    NVTX.@range "V" @. out = H.Vs * v
    synchronize(ctx)
    # apply K opereator
    coords = H.n - 1
    inds_template = ('a' - 1) .+ collect(1:H.d*(coords))
    v_t = CuTensor(v, copy(inds_template))
    out_t = CuTensor(out, copy(inds_template))
    for dim = 1:H.d
        for coord1 = 1:coords
            for coord2 = 1:coord1
                i1 = which_index(H.n, dim, coord1)
                i2 = which_index(H.n, dim, coord2)
                @assert v_t.inds == inds_template "v indices permuted"
                if i1 == i2
                    @assert H.K_diag.inds[2] == 'A' "K_diag indices permuted"
                    H.K_diag.inds[1] = 'a' - 1 + i1
                    v_t.inds[i1] = 'A'
                    #synchronize(ctx)
                    NVTX.@range "K-diag" out_t = contract_accumulate!(out_t, H.K_diag, v_t)
                    v_t.inds[i1] = 'a' - 1 + i1
                else
                    @assert H.K_mixed.inds[2] == 'A' && H.K_mixed.inds[4] == 'B' "K_mixed indices permuted"
                    H.K_mixed.inds[1] = 'a' - 1 + i1
                    H.K_mixed.inds[3] = 'a' - 1 + i2
                    # OPTIMIZE: A and B can be swapped
                    v_t.inds[i1] = 'A'
                    v_t.inds[i2] = 'B'
                    #synchronize(ctx)
                    NVTX.@range "K-mixed" out_t = contract_accumulate!(out_t, H.K_mixed, v_t)
                    v_t.inds[i1] = 'a' - 1 + i1
                    v_t.inds[i2] = 'a' - 1 + i2
                end
            end
        end
    end
    @assert out_t.inds == inds_template "out indices permuted"
    synchronize(ctx)
    return out_t.data
end

"Apply H on v and return the result"
function (H::HOperator{T})(v::CuArray{Complex{T}})::CuArray{Complex{T}} where {T<:Float}
    out = similar(v)
    return mul!(out, H, v)
end

tolerance = 1e-6

"Wrapper for KrylovKit.eigsolve"
function eig(H::HOperator{T}, levels::Int)::Tuple{Vector{T},Any,Any} where {T<:Float}
    x₀ = CUDA.rand(Complex{T}, vectorDims(H)...) # ... added
    synchronize()
    evals, evecs, info = eigsolve(H, x₀, levels, :SR; ishermitian = true, tol = tolerance)
    info.converged < levels && throw(error("Not enough convergence"))
    return real.(evals), evecs, info
end