diff --git a/src/assemblers/Assemblers.jl b/src/assemblers/Assemblers.jl
index 9528edf..a6c8977 100644
--- a/src/assemblers/Assemblers.jl
+++ b/src/assemblers/Assemblers.jl
@@ -8,24 +8,98 @@ $(TYPEDSIGNATURES)
 """
 KA.get_backend(asm::AbstractAssembler) = KA.get_backend(asm.dof)
 
+abstract type AssembledReturnType end
+
+struct AssembledMatrix <: AssembledReturnType
+end
+
+struct AssembledScalar <: AssembledReturnType
+end
+
+struct AssembledStruct{T} <: AssembledReturnType
+end
+
+struct AssembledVector <: AssembledReturnType
+end
+
+# fall back method does nothing
+@inline function _accumulate_q_value(::AssembledReturnType, storage, val_q, val_e, q, e)
+  val_e = val_e + val_q
+  return val_e
+end
+
+@inline function _accumulate_q_value(::AssembledScalar, storage, val_q, val_e, q, e)
+  storage[q, e] = val_q
+  return nothing
+end
+
+@inline function _accumulate_q_value(::AssembledStruct, storage, val_q, val_e, q, e)
+  storage[q, e] = val_q
+  return nothing
+end
+
+# assembly helper methods below
+function _assemble_element!(
+  storage, R_el::SVector, 
+  conns, # all connectivities for this block
+  el_id::Int, ::Int, ::Int
+)
+  n_dofs = size(storage, 1)
+  for d in axes(storage, 1)
+    for n in axes(conns, 1)
+      global_id = n_dofs * (conns[n, el_id] - 1) + d
+      local_id = n_dofs * (n - 1) + d
+      Atomix.@atomic storage.data[global_id] += R_el[local_id]
+    end
+  end
+  return nothing
+end
+
+# has a pretty dumb name for what it does
+function _assemble_element!(
+  storage, val_q::T, 
+  conns, # all connectivities for this block
+  ::Int, ::Int, ::Int
+) where T <: Number
+  return nothing
+end
+
+# TODO we'll need a regular matrix implementation
+# as well (Can we live with 1?)
 """
 $(TYPEDSIGNATURES)
-Assembly method for an H1Field, e.g. internal force
-Called on a single element e for a given block b where local_val
-has already been constructed from quadrature contributions.
-"""
-function _assemble_element!(global_val::H1Field, local_val, conn)
-  n_dofs = size(global_val, 1)
-  for i in axes(conn, 1)
-    for d in 1:n_dofs
-      global_id = n_dofs * (conn[i] - 1) + d
-      local_id = n_dofs * (i - 1) + d
-      global_val[global_id] += local_val[local_id]
-    end
+Specialization of of ```_assemble_element!``` for ```SparseMatrixAssembler```.
+"""
+function _assemble_element!(
+  storage, K_el::SMatrix, 
+  conns, # all connectivities for this block
+  el_id::Int,
+  block_start_index::Int, block_el_level_size::Int
+)
+  # figure out ids needed to update
+  start_id = block_start_index + 
+             (el_id - 1) * block_el_level_size
+  end_id = start_id + block_el_level_size - 1
+  ids = start_id:end_id
+
+  # get appropriate storage and update values
+  # @views storage[ids] += K_el[:]
+  for (i, id) in enumerate(ids)
+    storage[id] += K_el.data[i]
   end
   return nothing
 end
 
+# has a pretty dumb name for what it does
+# fall back fro struct
+function _assemble_element!(
+  storage, val_q, 
+  conns, # all connectivities for this block
+  ::Int, ::Int, ::Int
+)
+  return nothing
+end
+
 """
 $(TYPEDSIGNATURES)
 """
@@ -102,6 +176,43 @@ $(TYPEDSIGNATURES)
   return R_el
 end
 
+"""
+$(TYPEDSIGNATURES)
+"""
+@inline function _element_scratch(::AssembledMatrix, ref_fe, U)
+  ND = size(U, 1)
+  NNPE = ReferenceFiniteElements.num_vertices(ref_fe)
+  NxNDof = NNPE * ND
+  K_el = zeros(SMatrix{NxNDof, NxNDof, eltype(U), NxNDof * NxNDof})
+  return K_el
+end
+
+"""
+$(TYPEDSIGNATURES)
+"""
+@inline function _element_scratch(::AssembledScalar, ref_fe, U)
+  s_el = zero(eltype(U))
+  return s_el
+end
+
+"""
+$(TYPEDSIGNATURES)
+"""
+@inline function _element_scratch(::AssembledStruct, ref_fe, U)
+  return nothing
+end
+
+"""
+$(TYPEDSIGNATURES)
+"""
+@inline function _element_scratch(::AssembledVector, ref_fe, U)
+  ND = size(U, 1)
+  NNPE = ReferenceFiniteElements.num_vertices(ref_fe)
+  NxNDof = NNPE * ND
+  R_el = zeros(SVector{NxNDof, eltype(U)})
+  return R_el
+end
+
 """
 $(TYPEDSIGNATURES)
 """
@@ -181,6 +292,143 @@ function stiffness(assembler::AbstractAssembler)
   return _stiffness(assembler, KA.get_backend(assembler))
 end
 
+# General assembler methods
+
+# CPU implementation
+"""
+$(TYPEDSIGNATURES)
+Assembly method for a block labelled as block_id. This is a CPU implementation
+with no threading.
+
+TODO add state variables and physics properties
+"""
+function _assemble_block!(
+  ::KA.CPU,
+  # field::AbstractArray{<:Number, 1}, 
+  field,
+  conns::Connectivity, block_start_index::Int, block_el_level_size::Int,
+  func::Function,
+  physics::AbstractPhysics, ref_fe::ReferenceFE,
+  X::AbstractField, t::T, dt::T,
+  U::Solution, U_old::Solution, 
+  state_old::S, state_new::S, props::AbstractArray,
+  return_type::R
+) where {
+  T        <: Number,
+  Solution <: AbstractField,
+  S,       #<: L2QuadratureField
+  R        <: AssembledReturnType
+}
+
+  for e in axes(conns, 2)
+    x_el = _element_level_fields_flat(X, ref_fe, conns, e)
+    u_el = _element_level_fields_flat(U, ref_fe, conns, e)
+    u_el_old = _element_level_fields_flat(U_old, ref_fe, conns, e)
+    props_el = _element_level_properties(props, e)
+    val_el = _element_scratch(return_type, ref_fe, U)
+
+    for q in 1:num_quadrature_points(ref_fe)
+      interps = _cell_interpolants(ref_fe, q)
+      state_old_q = _quadrature_level_state(state_old, q, e)
+      state_new_q = _quadrature_level_state(state_new, q, e)
+      val_q = func(physics, interps, x_el, t, dt, u_el, u_el_old, state_old_q, state_new_q, props_el)
+      val_el = _accumulate_q_value(return_type, field, val_q, val_el, q, e)
+    end
+    _assemble_element!(field, val_el, conns, e, block_start_index, block_el_level_size)
+  end
+  return nothing
+end
+
+# GPU implementation
+# COV_EXCL_START
+KA.@kernel function _assemble_block_kernel!(
+  # field::AbstractArray{<:Number, 1}, 
+  field,
+  conns::Connectivity, block_start_index::Int, block_el_level_size::Int,
+  func::Function,
+  physics::AbstractPhysics, ref_fe::ReferenceFE,
+  X::AbstractField, t::T, dt::T,
+  U::Solution, U_old::Solution, 
+  state_old::S, state_new::S, props::AbstractArray,
+  return_type::R
+) where {
+  T        <: Number,
+  Solution <: AbstractField,
+  S,       #<: L2QuadratureField
+  R        <: AssembledReturnType
+}
+  E = KA.@index(Global)
+
+  x_el = _element_level_fields_flat(X, ref_fe, conns, E)
+  u_el = _element_level_fields_flat(U, ref_fe, conns, E)
+  u_el_old = _element_level_fields_flat(U_old, ref_fe, conns, E)
+  props_el = _element_level_properties(props, E)
+  val_el = _element_scratch(return_type, ref_fe, U)
+  for q in 1:num_quadrature_points(ref_fe)
+    interps = _cell_interpolants(ref_fe, q)
+    state_old_q = _quadrature_level_state(state_old, q, E)
+    state_new_q = _quadrature_level_state(state_new, q, E)
+    val_q = func(physics, interps, x_el, t, dt, u_el, u_el_old, state_old_q, state_new_q, props_el)
+    val_el = _accumulate_q_value(return_type, field, val_q, val_el, q, E)
+  end
+
+  _assemble_element!(field, val_el, conns, E, block_start_index, block_el_level_size)
+end
+# COV_EXCL_STOP
+
+# method for kernel generation
+function _assemble_block!(
+  backend::KA.Backend, 
+  field, 
+  conns, block_start_index, block_el_level_size,
+  func,
+  physics, ref_fe,
+  X, t, dt, U, U_old, state_old, state_new, props,
+  return_type
+)
+  kernel! = _assemble_block_kernel!(backend)
+  kernel!(
+    field,
+    conns, block_start_index, block_el_level_size, 
+    func,
+    physics, ref_fe,
+    X, t, dt,
+    U, U_old, 
+    state_old, state_new, props,
+    return_type,
+    ndrange=size(conns, 2)
+  )
+  return nothing
+end
+
+function create_assembler_cache(
+  asm::AbstractAssembler, 
+  ::AssembledMatrix
+)
+  return zeros(length(asm.matrix_pattern.Is))
+end
+
+function create_assembler_cache(
+  asm::AbstractAssembler,
+  ::AssembledScalar
+)
+  vals = L2ElementField[]
+  fspace = function_space(asm.dof)
+  for (conn, ref_fe) in zip(values(fspace.elem_conns), values(fspace.ref_fes))
+    NQ = ReferenceFiniteElements.num_quadrature_points(ref_fe)
+    NE = size(conn, 2)
+    push!(vals, L2ElementField(zeros(Float64, NQ, NE)))
+  end
+  return NamedTuple{keys(fspace.elem_conns)}(tuple(vals...))
+end
+
+function create_assembler_cache(
+  asm::AbstractAssembler, 
+  ::AssembledVector
+)
+  return create_field(asm)
+end
+
 # some utilities
 include("Constraints.jl")
 include("SparsityPatterns.jl")
diff --git a/src/assemblers/Matrix.jl b/src/assemblers/Matrix.jl
index 3dfd5e8..03aaf15 100644
--- a/src/assemblers/Matrix.jl
+++ b/src/assemblers/Matrix.jl
@@ -25,10 +25,12 @@ function assemble_matrix!(
   storage, pattern, dof, func::F, Uu, p
 ) where F <: Function
   fill!(storage, zero(eltype(storage)))
+  backend = KA.get_backend(storage)
   fspace = function_space(dof)
   t = current_time(p.times)
   dt = time_step(p.times)
   _update_for_assembly!(p, dof, Uu)
+  return_type = AssembledMatrix()
   for (
     conns, block_start_index, block_el_level_size,
     block_physics, ref_fe, 
@@ -41,9 +43,7 @@ function assemble_matrix!(
     values(p.state_old), values(p.state_new),
     values(p.properties),
   )
-    # TODO re-enable back-end checking
-    backend = KA.get_backend(p.h1_field)
-    _assemble_block_matrix!(
+    _assemble_block!(
       backend,
       storage,
       conns, block_start_index, block_el_level_size,
@@ -51,130 +51,8 @@ function assemble_matrix!(
       block_physics, ref_fe,
       p.h1_coords, t, dt,
       p.h1_field, p.h1_field_old, 
-      state_old, state_new, props
+      state_old, state_new, props,
+      return_type
     )
   end
 end
-
-# CPU implementation
-
-"""
-$(TYPEDSIGNATURES)
-Assembly method for a block labelled as block_id. This is a CPU implementation
-with no threading.
-
-TODO add state variables and physics properties
-"""
-function _assemble_block_matrix!(
-  ::KA.CPU,
-  field::AbstractArray{<:Number, 1}, 
-  conns::Connectivity, block_start_index::Int, block_el_level_size::Int,
-  func::Function,
-  physics::AbstractPhysics, ref_fe::ReferenceFE,
-  X::AbstractField, t::T, dt::T,
-  U::Solution, U_old::Solution, 
-  state_old::S, state_new::S, props::AbstractArray
-) where {
-  T        <: Number,
-  Solution <: AbstractField,
-  S        #<: L2QuadratureField
-}
-
-  for e in axes(conns, 2)
-    x_el = _element_level_fields_flat(X, ref_fe, conns, e)
-    u_el = _element_level_fields_flat(U, ref_fe, conns, e)
-    u_el_old = _element_level_fields_flat(U_old, ref_fe, conns, e)
-    props_el = _element_level_properties(props, e)
-    K_el = _element_scratch_matrix(ref_fe, U)
-
-    for q in 1:num_quadrature_points(ref_fe)
-      interps = _cell_interpolants(ref_fe, q)
-      state_old_q = _quadrature_level_state(state_old, q, e)
-      state_new_q = _quadrature_level_state(state_new, q, e)
-      K_q = func(physics, interps, x_el, t, dt, u_el, u_el_old, state_old_q, state_new_q, props_el)
-      K_el = K_el + K_q
-    end
-    _assemble_element!(field, K_el, e, block_start_index, block_el_level_size)
-  end
-  return nothing
-end
-
-
-# GPU implementation
-# COV_EXCL_START
-KA.@kernel function _assemble_block_matrix_kernel!(
-  field::AbstractArray{<:Number, 1}, 
-  conns::Connectivity, block_start_index::Int, block_el_level_size::Int,
-  func::Function,
-  physics::AbstractPhysics, ref_fe::ReferenceFE,
-  X::AbstractField, t::T, dt::T,
-  U::Solution, U_old::Solution, 
-  state_old::S, state_new::S, props::AbstractArray
-) where {
-  T        <: Number,
-  Solution <: AbstractField,
-  S        #<: L2QuadratureField
-}
-  E = KA.@index(Global)
-
-  x_el = _element_level_fields_flat(X, ref_fe, conns, E)
-  u_el = _element_level_fields_flat(U, ref_fe, conns, E)
-  u_el_old = _element_level_fields_flat(U_old, ref_fe, conns, E)
-  props_el = _element_level_properties(props, E)
-  K_el = _element_scratch_matrix(ref_fe, U)
-  for q in 1:num_quadrature_points(ref_fe)
-    interps = _cell_interpolants(ref_fe, q)
-    state_old_q = _quadrature_level_state(state_old, q, E)
-    state_new_q = _quadrature_level_state(state_new, q, E)
-    K_q = func(physics, interps, x_el, t, dt, u_el, u_el_old, state_old_q, state_new_q, props_el)
-    K_el = K_el + K_q
-  end
-
-  # leaving here just in case
-  # block_start_index = values(pattern.block_start_indices)[block_id]
-  # block_el_level_size = values(pattern.block_el_level_sizes)[block_id]
-  # start_id = block_start_index + 
-  #            (E - 1) * block_el_level_size
-  # end_id = start_id + block_el_level_size - 1
-  # ids = start_id:end_id
-  # for (i, id) in enumerate(ids)
-  #   Atomix.@atomic field[id] += K_el.data[i]
-  # end
-  _assemble_element!(field, K_el, E, block_start_index, block_el_level_size)
-end
-# COV_EXCL_STOP
-
-"""
-$(TYPEDSIGNATURES)
-Assembly method for a block labelled as block_id. This is a GPU agnostic implementation
-using KernelAbstractions and Atomix for eliminating race conditions
-
-TODO add state variables and physics properties
-"""
-function _assemble_block_matrix!(
-  backend::KA.Backend,
-  field::AbstractArray{<:Number, 1}, 
-  conns::Connectivity, block_start_index::Int, block_el_level_size::Int,
-  func::Function,
-  physics::AbstractPhysics, ref_fe::ReferenceFE,
-  X::AbstractField, t::T, dt::T,
-  U::Solution, U_old::Solution, 
-  state_old::S, state_new::S, props::AbstractArray
-) where {
-  T        <: Number,
-  Solution <: AbstractField,
-  S        #<: L2QuadratureField
-}
-  kernel! = _assemble_block_matrix_kernel!(backend)
-  kernel!(
-    field,
-    conns, block_start_index, block_el_level_size, 
-    func,
-    physics, ref_fe,
-    X, t, dt,
-    U, U_old, 
-    state_old, state_new, props,
-    ndrange=size(conns, 2)
-  )
-  return nothing
-end
diff --git a/src/assemblers/MatrixAction.jl b/src/assemblers/MatrixAction.jl
index a75ce6c..a0882ff 100644
--- a/src/assemblers/MatrixAction.jl
+++ b/src/assemblers/MatrixAction.jl
@@ -19,10 +19,12 @@ function assemble_matrix_action!(
   storage, dof, func, Uu, Vu, p
 )
   fill!(storage, zero(eltype(storage)))
+  backend = KA.get_backend(storage)
   fspace = function_space(dof)
   t = current_time(p.times)
   Δt = time_step(p.times)
   _update_for_assembly!(p, dof, Uu, Vu)
+  return_type = AssembledVector()
   for (
     conns, 
     block_physics, ref_fe,
@@ -33,16 +35,16 @@ function assemble_matrix_action!(
     values(p.state_old), values(p.state_new),
     values(p.properties)
   )
-    backend = KA.get_backend(p.h1_field)
     _assemble_block_matrix_action!(
       backend,
       storage,
-      conns,
+      conns, 0, 0,
       func,
       block_physics, ref_fe,
       p.h1_coords, t, Δt,
       p.h1_field, p.h1_field_old, p.h1_hvp_scratch_field,
-      state_old, state_new, props
+      state_old, state_new, props,
+      return_type
     )
   end
 end
@@ -60,12 +62,13 @@ TODO improve typing of fields to ensure they mathc up in terms of function
 function _assemble_block_matrix_action!(
   ::KA.CPU,
   field::AbstractField, 
-  conns::Connectivity,
+  conns::Connectivity, b1, b2,
   func::Function,
   physics::AbstractPhysics, ref_fe::ReferenceFE,
   X::AbstractField, t::T, Δt::T,
   U::Solution, U_old::Solution, V::Solution,
-  state_old::S, state_new::S, props::AbstractArray
+  state_old::S, state_new::S, props::AbstractArray,
+  return_type
 ) where {
   T        <: Number,
   Solution <: AbstractField,
@@ -86,7 +89,9 @@ function _assemble_block_matrix_action!(
       K_el = K_el + K_q
     end
     Kv_el = K_el * v_el
-    @views _assemble_element!(field, Kv_el, conns[:, e])
+    # @views _assemble_element!(field, Kv_el, conns[:, e])
+    # zeros aren't need for this case of this method
+    @views _assemble_element!(field, Kv_el, conns, e, 0, 0)
   end
 end
 
@@ -101,12 +106,13 @@ TODO mark const fields
 # COV_EXCL_START
 KA.@kernel function _assemble_block_matrix_action_kernel!(
   field::AbstractField, 
-  conns::Connectivity,
+  conns::Connectivity, b1, b2,
   func::Function,
   physics::AbstractPhysics, ref_fe::ReferenceFE,
   X::AbstractField, t::T, Δt::T,
   U::Solution, U_old::Solution, V::Solution,
-  state_old::S, state_new::S, props::AbstractArray
+  state_old::S, state_new::S, props::AbstractArray,
+  return_type
 ) where {
   T        <: Number,
   Solution <: AbstractField,
@@ -130,14 +136,17 @@ KA.@kernel function _assemble_block_matrix_action_kernel!(
   Kv_el = K_el * v_el
 
   # now assemble atomically
-  n_dofs = size(field, 1)
-  for i in 1:size(conns, 1)
-    for d in 1:n_dofs
-      global_id = n_dofs * (conns[i, E] - 1) + d
-      local_id = n_dofs * (i - 1) + d
-      Atomix.@atomic field.vals[global_id] += Kv_el[local_id]
-    end
-  end
+  # n_dofs = size(field, 1)
+  # for i in 1:size(conns, 1)
+  #   for d in 1:n_dofs
+  #     global_id = n_dofs * (conns[i, E] - 1) + d
+  #     local_id = n_dofs * (i - 1) + d
+  #     Atomix.@atomic field.vals[global_id] += Kv_el[local_id]
+  #   end
+  # end
+  # zeros are there since that method needs ints for
+  # matrix case
+  _assemble_element!(field, Kv_el, conns, E, 0, 0)
 end
 # COV_EXCL_STOP
 
@@ -151,12 +160,13 @@ TODO add state variables and physics properties
 function _assemble_block_matrix_action!(
   backend::KA.Backend,
   field::AbstractField, 
-  conns::Connectivity,
+  conns::Connectivity, b1, b2,
   func::Function,
   physics::AbstractPhysics, ref_fe::ReferenceFE,
   X::AbstractField, t::T, Δt::T,
   U::Solution, U_old::Solution, V::Solution,
-  state_old::S, state_new::S, props::AbstractArray
+  state_old::S, state_new::S, props::AbstractArray,
+  return_type
 ) where {
   T        <: Number,
   Solution <: AbstractField,
@@ -165,12 +175,13 @@ function _assemble_block_matrix_action!(
   kernel! = _assemble_block_matrix_action_kernel!(backend)
   kernel!(
     field, 
-    conns,
+    conns, b1, b2,
     func,
     physics, ref_fe, 
     X, t, Δt,
     U, U_old, V, 
     state_old, state_new, props, t, Δt,
+    return_type,
     ndrange=size(conns, 2)
   )
   return nothing
diff --git a/src/assemblers/NeumannBC.jl b/src/assemblers/NeumannBC.jl
index b813c09..8dd9b10 100644
--- a/src/assemblers/NeumannBC.jl
+++ b/src/assemblers/NeumannBC.jl
@@ -22,9 +22,9 @@ function assemble_vector_neumann_bc!(
   # do not zero!
   # TODO should below 2 methods calls be assumed to have
   # been conducted previously?
+  backend = KA.get_backend(p.h1_field)
   _update_for_assembly!(p, dof, Uu)
   for bc in values(p.neumann_bcs.bc_caches)
-    backend = KA.get_backend(bc)
     _assemble_block_vector_neumann_bc!(
       backend, storage,
       p.h1_field, p.h1_coords,
@@ -62,7 +62,7 @@ function _assemble_block_vector_neumann_bc!(
       R_el = R_el + JxW * Nvec * f_val
     end
 
-    @views _assemble_element!(field, R_el, bc.surface_conns[:, e])
+    @views _assemble_element!(field, R_el, bc.surface_conns, e, 0, 0)
   end
 end
 
@@ -100,15 +100,7 @@ KA.@kernel function _assemble_block_vector_neumann_bc_kernel!(
     R_el = R_el + JxW * Nvec * f_val
   end
 
-  # now assemble atomically
-  n_dofs = size(field, 1)
-  for i in 1:size(bc.surface_conns, 1)
-    for d in 1:n_dofs
-      global_id = n_dofs * (bc.surface_conns[i, E] - 1) + d
-      local_id = n_dofs * (i - 1) + d
-      Atomix.@atomic field.data[global_id] += R_el[local_id]
-    end
-  end
+  _assemble_element!(field, R_el, bc.surface_conns, E, 0, 0)
 end
 # COV_EXCL_STOP
 
diff --git a/src/assemblers/QuadratureQuantity.jl b/src/assemblers/QuadratureQuantity.jl
index 8b1a3cd..fedc4cf 100644
--- a/src/assemblers/QuadratureQuantity.jl
+++ b/src/assemblers/QuadratureQuantity.jl
@@ -4,8 +4,11 @@ $(TYPEDSIGNATURES)
 function assemble_scalar!(
   assembler, func::F, Uu, p
 ) where F <: Function
+  # the nothing below is because
+  # there is no needed sparsity pattern
   assemble_quadrature_quantity!(
-    assembler.scalar_quadrature_storage, assembler.dof,
+    assembler.scalar_quadrature_storage, 
+    nothing, assembler.dof,
     func, Uu, p
   )
 end
@@ -14,13 +17,15 @@ end
 $(TYPEDSIGNATURES)
 """
 function assemble_quadrature_quantity!(
-  storage, dof,
-  func::F, Uu, p
+  storage, pattern, dof,
+  func::F, Uu, p, return_type = AssembledScalar()
 ) where F <: Function
+  backend = KA.get_backend(p.h1_field)
   fspace = function_space(dof)
   t = current_time(p.times)
   Δt = time_step(p.times)
   _update_for_assembly!(p, dof, Uu)
+  # return_type = AssembledScalar()
   for (
     block_storage,
     conns, 
@@ -33,129 +38,16 @@ function assemble_quadrature_quantity!(
     values(p.state_old), values(p.state_new),
     values(p.properties)
   )
-    backend = KA.get_backend(p.h1_field)
-    _assemble_block_quadrature_quantity!(
+    _assemble_block!(
       backend,
       block_storage,
-      conns,
+      conns, 0, 0,
       func,
       block_physics, ref_fe,
       p.h1_coords, t, Δt,
       p.h1_field, p.h1_field_old,
-      state_old, state_new, props
+      state_old, state_new, props,
+      return_type
     )
   end
 end
-
-# CPU Implementation
-
-"""
-$(TYPEDSIGNATURES)
-Assembly method for a block labelled as block_id. This is a CPU implementation
-with no threading.
-
-TODO improve typing of fields to ensure they mathc up in terms of function 
-  spaces
-"""
-function _assemble_block_quadrature_quantity!(
-  ::KA.CPU,
-  field::AbstractArray, 
-  conns::Connectivity,
-  func::Function,
-  physics::AbstractPhysics, ref_fe::ReferenceFE,
-  X::AbstractField, t::T, Δt::T,
-  U::Solution, U_old::Solution,
-  state_old::S, state_new::S, props::AbstractArray
-) where {
-  T        <: Number,
-  Solution <: AbstractField,
-  S        #<: L2QuadratureField
-}
-  for e in axes(conns, 2)
-    x_el = _element_level_fields_flat(X, ref_fe, conns, e)
-    u_el = _element_level_fields_flat(U, ref_fe, conns, e)
-    u_el_old = _element_level_fields_flat(U_old, ref_fe, conns, e)
-    props_el = _element_level_properties(props, e)
-
-    for q in 1:num_quadrature_points(ref_fe)
-      interps = _cell_interpolants(ref_fe, q)
-      state_old_q = _quadrature_level_state(state_old, q, e)
-      state_new_q = _quadrature_level_state(state_new, q, e)
-      e_q = func(physics, interps, x_el, t, Δt, u_el, u_el_old, state_old_q, state_new_q, props_el)
-      field[q, e] = e_q
-    end
-  end
-end
-
-"""
-$(TYPEDSIGNATURES)
-Kernel for residual block assembly
-
-TODO mark const fields
-"""
-# COV_EXCL_START
-KA.@kernel function _assemble_block_quadrature_quantity_kernel!(
-  field::AbstractArray, 
-  conns::Connectivity,
-  func::Function,
-  physics::AbstractPhysics, ref_fe::ReferenceFE,
-  X::AbstractField, t::T, Δt::T,
-  U::Solution, U_old::Solution,
-  state_old::S, state_new::S, props::AbstractArray
-) where {
-  T        <: Number,
-  Solution <: AbstractField,
-  S        #<: L2QuadratureField
-}
-  # Q, E = KA.@index(Global, NTuple)
-  E = KA.@index(Global)
-  x_el = _element_level_fields_flat(X, ref_fe, conns, E)
-  u_el = _element_level_fields_flat(U, ref_fe, conns, E)
-  u_el_old = _element_level_fields_flat(U_old, ref_fe, conns, E)
-  props_el = _element_level_properties(props, E)
-
-  KA.Extras.@unroll for q in 1:num_quadrature_points(ref_fe)
-    interps = _cell_interpolants(ref_fe, q)
-    state_old_q = _quadrature_level_state(state_old, q, E)
-    state_new_q = _quadrature_level_state(state_new, q, E)
-    e_q = func(physics, interps, x_el, t, Δt, u_el, u_el_old, state_old_q, state_new_q, props_el)
-    @inbounds field[q, E] = e_q
-  end
-end
-# COV_EXCL_STOP
-
-"""
-$(TYPEDSIGNATURES)
-Assembly method for a block labelled as block_id. This is a GPU agnostic implementation
-using KernelAbstractions and Atomix for eliminating race conditions
-
-TODO add state variables and physics properties
-"""
-function _assemble_block_quadrature_quantity!(
-  backend::KA.Backend,
-  field::AbstractArray, 
-  conns::Connectivity,
-  func::Function,
-  physics::AbstractPhysics, ref_fe::ReferenceFE,
-  X::AbstractField, t::T, Δt::T,
-  U::Solution, U_old::Solution,
-  state_old::S, state_new::S, props::AbstractArray
-) where {
-  T        <: Number,
-  Solution <: AbstractField,
-  S        #<: L2QuadratureField
-}
-  kernel! = _assemble_block_quadrature_quantity_kernel!(backend)
-  kernel!(
-    field, 
-    conns,
-    func,
-    physics, ref_fe, 
-    X, t, Δt,
-    U, U_old, 
-    state_old, state_new, props,
-    ndrange=size(field, 2)
-  )
-  # KA.synchronize(backend)
-  return nothing
-end
diff --git a/src/assemblers/SparsityPatterns.jl b/src/assemblers/SparsityPatterns.jl
index aceaa93..1eb9e6c 100644
--- a/src/assemblers/SparsityPatterns.jl
+++ b/src/assemblers/SparsityPatterns.jl
@@ -7,14 +7,11 @@ case where you want to eliminate fixed-dofs or not.
 """
 struct SparseMatrixPattern{
   I <: AbstractArray{Int, 1},
-  # B,
   R <: AbstractArray{Float64, 1}
 }
   Is::I
   Js::I
   unknown_dofs::I
-  # block_start_indices::B
-  # block_el_level_sizes::B
   block_start_indices::Vector{Int}
   block_el_level_sizes::Vector{Int}
   # cache arrays
@@ -61,11 +58,6 @@ function SparseMatrixPattern(dof::DofManager)
     block_el_level_sizes[n] = size(conn, 1)^2
   end
 
-  # convert to NamedTuples so it's easy to index
-  # block_syms = keys(fspace.ref_fes)
-  # block_start_indices = NamedTuple{block_syms}(tuple(block_start_indices)...)
-  # block_el_level_sizes = NamedTuple{block_syms}(tuple(block_el_level_sizes)...)
-
   # setup pre-allocated arrays based on number of entries found above
   Is = Vector{Int64}(undef, n_entries)
   Js = Vector{Int64}(undef, n_entries)
@@ -142,29 +134,6 @@ function SparseArrays.sparse!(pattern::SparseMatrixPattern, storage)
   )
 end
 
-"""
-$(TYPEDSIGNATURES)
-Specialization of of ```_assemble_element!``` for ```SparseMatrixAssembler```.
-"""
-function _assemble_element!(
-  storage, K_el::SMatrix, 
-  el_id::Int,
-  block_start_index::Int, block_el_level_size::Int
-)
-  # figure out ids needed to update
-  start_id = block_start_index + 
-             (el_id - 1) * block_el_level_size
-  end_id = start_id + block_el_level_size - 1
-  ids = start_id:end_id
-
-  # get appropriate storage and update values
-  # @views storage[ids] += K_el[:]
-  for (i, id) in enumerate(ids)
-    storage[id] += K_el.data[i]
-  end
-  return nothing
-end
-
 num_entries(s::SparseMatrixPattern) = length(s.Is)
 
 # NOTE this methods assumes that dof is up to date
diff --git a/src/assemblers/Vector.jl b/src/assemblers/Vector.jl
index d0914ac..2baf9fd 100644
--- a/src/assemblers/Vector.jl
+++ b/src/assemblers/Vector.jl
@@ -5,7 +5,8 @@ function assemble_vector!(
   assembler, func::F, Uu, p
 ) where F <: Function
   assemble_vector!(
-    assembler.residual_storage, assembler.dof,
+    assembler.residual_storage, 
+    assembler.vector_pattern, assembler.dof,
     func, Uu, p
   )
   return nothing
@@ -15,15 +16,17 @@ end
 $(TYPEDSIGNATURES)
 """
 function assemble_vector!(
-  storage, dof, func::F, Uu, p
+  storage, pattern, dof, func::F, Uu, p
 ) where F <: Function
   fill!(storage, zero(eltype(storage)))
+  backend = KA.get_backend(storage)
   fspace = function_space(dof)
   t = current_time(p.times)
   Δt = time_step(p.times)
   _update_for_assembly!(p, dof, Uu)
+  return_type = AssembledVector()
   for (
-    conns, 
+    conns,
     block_physics, ref_fe,
     state_old, state_new, props
   ) in zip(
@@ -32,146 +35,19 @@ function assemble_vector!(
     values(p.state_old), values(p.state_new),
     values(p.properties)
   )
-    backend = KA.get_backend(p.h1_field)
-    _assemble_block_vector!(
+    _assemble_block!(
       backend,
       storage,
-      conns,
+      conns, 0, 0, # NOTE these are never used for vectors
+      # TODO eventually we'll need them if we want to use sparse vectors
       func,
       block_physics, ref_fe,
       p.h1_coords, t, Δt,
       p.h1_field, p.h1_field_old,
-      state_old, state_new, props
+      state_old, state_new, props,
+      return_type
     )
   end
   
   return nothing
 end
-
-# CPU Implementation
-
-"""
-$(TYPEDSIGNATURES)
-Assembly method for a block labelled as block_id. This is a CPU implementation
-with no threading.
-
-TODO improve typing of fields to ensure they mathc up in terms of function 
-  spaces
-"""
-function _assemble_block_vector!(
-  ::KA.CPU,
-  field::AbstractField, 
-  conns::Connectivity,
-  func::Function,
-  physics::AbstractPhysics, ref_fe::ReferenceFE,
-  X::AbstractField, t::T, Δt::T,
-  U::Solution, U_old::Solution,
-  state_old::S, state_new::S, props::AbstractArray
-) where {
-  T        <: Number,
-  Solution <: AbstractField,
-  S        #<: L2QuadratureField
-}
-  for e in axes(conns, 2)
-    x_el = _element_level_fields_flat(X, ref_fe, conns, e)
-    u_el = _element_level_fields_flat(U, ref_fe, conns, e)
-    u_el_old = _element_level_fields_flat(U_old, ref_fe, conns, e)
-    props_el = _element_level_properties(props, e)
-    R_el = _element_scratch_vector(ref_fe, U)
-
-    for q in 1:num_quadrature_points(ref_fe)
-      interps = _cell_interpolants(ref_fe, q)
-      state_old_q = _quadrature_level_state(state_old, q, e)
-      state_new_q = _quadrature_level_state(state_new, q, e)
-      R_q = func(physics, interps, x_el, t, Δt, u_el, u_el_old, state_old_q, state_new_q, props_el)
-      R_el = R_el + R_q
-    end
-    
-    @views _assemble_element!(field, R_el, conns[:, e])
-  end
-end
-
-# GPU implementation
-
-"""
-$(TYPEDSIGNATURES)
-Kernel for residual block assembly
-
-TODO mark const fields
-"""
-# COV_EXCL_START
-KA.@kernel function _assemble_block_vector_kernel!(
-  field::AbstractField, 
-  conns::Connectivity,
-  func::Function,
-  physics::AbstractPhysics, ref_fe::ReferenceFE,
-  X::AbstractField, t::T, Δt::T,
-  U::Solution, U_old::Solution,
-  state_old::S, state_new::S, props::AbstractArray
-) where {
-  T        <: Number,
-  Solution <: AbstractField,
-  S        #<: L2QuadratureField
-}
-  E = KA.@index(Global)
-
-  x_el = _element_level_fields_flat(X, ref_fe, conns, E)
-  u_el = _element_level_fields_flat(U, ref_fe, conns, E)
-  u_el_old = _element_level_fields_flat(U_old, ref_fe, conns, E)
-  props_el = _element_level_properties(props, E)
-  R_el = _element_scratch_vector(ref_fe, U)
-
-  for q in 1:num_quadrature_points(ref_fe)
-    interps = _cell_interpolants(ref_fe, q)
-    state_old_q = _quadrature_level_state(state_old, q, E)
-    state_new_q = _quadrature_level_state(state_new, q, E)
-    R_q = func(physics, interps, x_el, t, Δt, u_el, u_el_old, state_old_q, state_new_q, props_el)
-    R_el = R_el + R_q
-  end
-
-  # now assemble atomically
-  n_dofs = size(field, 1)
-  for i in 1:size(conns, 1)
-    for d in 1:n_dofs
-      global_id = n_dofs * (conns[i, E] - 1) + d
-      local_id = n_dofs * (i - 1) + d
-      Atomix.@atomic field.data[global_id] += R_el[local_id]
-    end
-  end
-end
-# COV_EXCL_STOP
-
-"""
-$(TYPEDSIGNATURES)
-Assembly method for a block labelled as block_id. This is a GPU agnostic implementation
-using KernelAbstractions and Atomix for eliminating race conditions
-
-TODO add state variables and physics properties
-"""
-function _assemble_block_vector!(
-  backend::KA.Backend,
-  field::AbstractField, 
-  conns::Connectivity,
-  func::Function,
-  physics::AbstractPhysics, ref_fe::ReferenceFE,
-  X::AbstractField, t::T, Δt::T,
-  U::Solution, U_old::Solution,
-  state_old::S, state_new::S, props::AbstractArray
-) where {
-  T        <: Number,
-  Solution <: AbstractField,
-  S        #<: L2QuadratureField
-}
-  kernel! = _assemble_block_vector_kernel!(backend)
-  kernel!(
-    field, 
-    conns,
-    func,
-    physics, ref_fe, 
-    X, t, Δt,
-    U, U_old,
-    state_old, state_new, props,
-    ndrange=size(conns, 2)
-  )
-  return nothing
-end
diff --git a/src/bcs/PeriodicBCs.jl b/src/bcs/PeriodicBCs.jl
index 48b7ab4..a30b714 100644
--- a/src/bcs/PeriodicBCs.jl
+++ b/src/bcs/PeriodicBCs.jl
@@ -171,6 +171,33 @@ function _constraint_matrix(dof::DofManager, pbcs::PeriodicBCs)
     return sparse(Is, Js, Vs)
 end
 
+function _constraint_matrix_mask(dof_manager::DofManager, pbc::PeriodicBCs)
+    Is, Js, Vs = Int[], Int[], Float64[]
+    side_a_dofs = mapreduce(x -> x.side_a_dofs, vcat, pbc.bc_caches)
+    side_b_dofs = mapreduce(x -> x.side_b_dofs, vcat, pbc.bc_caches)
+    # @show length(side_a_dofs)
+    for dof in 1:length(dof_manager)
+        if dof in side_b_dofs
+            id = findfirst(isequal(dof), side_b_dofs)
+            if id > length(dof_manager)
+                @assert false
+            end
+            # push!(Is, dof)
+            # push!(Js, side_a_dofs[id])
+            # push!(Vs, 1.)
+
+            push!(Is, dof)
+            push!(Js, side_b_dofs[id])
+            push!(Vs, 1.)
+        else
+            push!(Is, dof)
+            push!(Js, dof)
+            push!(Vs, 1.)
+        end
+    end
+    return sparse(Is, Js, Vs)
+end
+
 function _create_constraint_field(dof::DofManager, pbcs::PeriodicBCs)
     n_constraints = mapreduce(x -> length(x.side_a_dofs), +, pbcs.bc_caches)
     return zeros(n_constraints)
diff --git a/test/poisson/TestPoissonPBCs.jl b/test/poisson/TestPoissonPBCs.jl
index 3e10847..4bf6393 100644
--- a/test/poisson/TestPoissonPBCs.jl
+++ b/test/poisson/TestPoissonPBCs.jl
@@ -46,7 +46,7 @@ function test_poisson_pbcs()
     for n in 1:10
         U = sol[1:nu]
         λ = sol[nu+1:end]
-        assemble_vector!(asm.residual_storage, asm.dof, residual, U, p)
+        assemble_vector!(asm.residual_storage, asm.vector_pattern, asm.dof, residual, U, p)
         R_u = residual(asm) + C' * λ
         R_c = C * U
         R = vcat(R_u, R_c)