diff --git a/py/redrock/archetypes.py b/py/redrock/archetypes.py
index e9943b4..aa32eae 100644
--- a/py/redrock/archetypes.py
+++ b/py/redrock/archetypes.py
@@ -69,6 +69,8 @@ def __init__(self, filename):
 
         # TODO: Allow Archetype files to specify their IGM model
         self.igm_model = 'Inoue14'
+        # TODO: Allow Archetype files to specify bvls or nnls or pca solver method
+        self._solver_method = 'bvls'
 
         self.method = 'ARCH'  # for API symmetry with Template.method
 
@@ -217,7 +219,7 @@ def eval(self, subtype, coeff, wave, z, R=None, legcoeff=None):
 
         return model
     
-    def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=None, binned=None, use_gpu=False, prior=None, ncam=None):
+    def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=None, binned=None, use_gpu=False, prior_sigma=None, ncam=None):
         
         """Nearest neighbour archetype approach; fitting with a combinating of nearest archetypes in chi2 space
         
@@ -235,7 +237,7 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest,
             dedges (dict): in GPU mode, use pre-computed dict of wavelength bin edges, already on GPU
             binned (dict): already computed dictionary of rebinned fluxes
             use_gpu (bool): use GPU or not
-            prior (2d array): prior matrix on coefficients (1/sig**2)
+            prior_sigma (float): prior to add in the final solution matrix: added as 1/(prior_sigma**2) only for per-camera mode
             ncam (int): Number of camera for given Instrument
         
         Returns:
@@ -249,11 +251,11 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest,
         #is multiplied by trans in get_best_archetype
         spectra = target.spectra
         nleg = target.nleg
+        bands = None
         legendre = target.legendre(nleg=nleg, use_gpu=False) #Get previously calculated legendre
 
         nleg = legendre[list(legendre.keys())[0]].shape[0]
         iBest = np.argsort(zzchi2)[0:n_nearest]
-        tdata = dict()
         if (binned is not None):
             if (use_gpu):
                 binned = { hs:binned[hs][:,iBest].get() for hs in binned }
@@ -268,17 +270,23 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest,
                     #Only multiply if trans[hs] is not None
                     #Both arrays are on CPU so no need to wrap with asarray
                     binned[hs] *= trans[hs][:,None]
-        for hs, w in dwave.items():
-            tdata[hs] = binned[hs][None,:,:]
-            if (nleg > 0):
-                tdata[hs] = np.append(tdata[hs], legendre[hs].transpose()[None,:,:], axis=2)
-            nbasis = tdata[hs].shape[2]
             
         if per_camera:
             #Use CPU mode since small tdata
-            (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux, nleg, 1, method='bvls', n_nbh=n_nearest, prior=prior, use_gpu=False, bands=target.bands)
+            #CW - 4/19/24 - pass use_gpu and solver method to per_camera_coeff_with_least_square_batch
+            #it will decide there if narch == 1 to use CPU and it will calculate correct prior array
+            #CW - 5/2/24 - pass binned instead of tdata to put all BVLS/NNLS code into per_camera_coeff_with_least_square_batch
+            (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, 1, method=self._solver_method, n_nbh=n_nearest, prior_sigma=prior_sigma, use_gpu=use_gpu, bands=bands)
+            #(zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, 1, method=self._solver_method, n_nbh=n_nearest, prior_sigma=prior_sigma, use_gpu=use_gpu, ncam=ncam)
         else:
             #Use CPU mode for calc_zchi2 since small tdata
+            #Calculate tdata here because binned is passed to per_camera_coeff_with_least_square_batch
+            tdata = dict()
+            for hs, w in dwave.items():
+                tdata[hs] = binned[hs][None,:,:]
+                if (nleg > 0):
+                    tdata[hs] = np.append(tdata[hs], legendre[hs].transpose()[None,:,:], axis=2)
+                nbasis = tdata[hs].shape[2]
             (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, tdata, weights, flux, wflux, 1, nbasis, use_gpu=False)
 
         sstype = ['%s'%(self._subtype[k]) for k in iBest] # subtypes of best archetypes
@@ -287,7 +295,7 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest,
         #print(z, zzchi2, zzcoeff, fsstype)
         return zzchi2[0], zzcoeff[0], make_fulltype(self._rrtype, fsstype)
 
-    def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nearest, trans=None, solve_method='bvls', prior=None, use_gpu=False):
+    def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nearest, trans=None, prior_sigma=None, use_gpu=False):
 
         """Get the best archetype for the given redshift and spectype.
 
@@ -301,8 +309,7 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea
             per_camera (bool): True if fitting needs to be done in each camera
             n_nearest (int): number of nearest neighbours to be used in chi2 space (including best archetype)
             trans (dict): pass previously calcualated Lyman transmission instead of recalculating
-            solve_method (string): bvls or pca
-            prior (2d array): prior matrix on coefficients (1/sig**2)
+            prior_sigma (float): prior to add in the final solution matrix: added as 1/(prior_sigma**2) only for per-camera mode
             use_gpu (bool): use GPU or not
             
         Returns:
@@ -314,6 +321,7 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea
         spectra = target.spectra
         nleg = target.nleg
         legendre = target.legendre(nleg=nleg, use_gpu=use_gpu) #Get previously calculated legendre
+        solve_method = self._solver_method #get solve method from archetype class instead of passing as arg
         bands = target.bands
 
         #Select np or cp for operations as arrtype
@@ -339,9 +347,7 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea
         #new_keys = [wkeys[0], wkeys[2], wkeys[1]]
         #obs_wave = np.concatenate([dwave[key] for key in new_keys])
         
-        nleg = legendre[list(legendre.keys())[0]].shape[0]
-        zzchi2 = np.zeros(self._narch, dtype=np.float64)
-        zzcoeff = np.zeros((self._narch,  1+ncam*(nleg)), dtype=np.float64)
+        #nleg = legendre[list(legendre.keys())[0]].shape[0]
         
         #TODO: return best fit model as well
         #zzmodel = np.zeros((self._narch, obs_wave.size), dtype=np.float64)
@@ -358,41 +364,47 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea
         #Rebin in batch
         binned = self.rebin_template_batch(z, dwave, trapz=True, dedges=dedges, use_gpu=use_gpu)
 
-        tdata = dict()
-        nbasis = 1
-        
         ## Prior must be redefined to remove nearest neighbour approach, 
         # because prior was defined based on n_nearest argument..
         # this logic is needed because the first fitting is done with just one archetype 
         #and then nearest neighbour approach is implemented
-        if n_nearest is not None:
-            nnearest_prior = prior.copy() # prior corresponding to nearest_nbh method
-            prior = prior[n_nearest-1:,][:,n_nearest-1:] # removing first rows/columns corresponding to the nearest_archetypes, and keeping just one row for one archetype
+
+        ##CW 04/19/24 - no need to redefine prior since it is now not calculated until per_camera_coeff_with_least_square_batch
+        #if n_nearest is not None:
+        #    nnearest_prior = prior.copy() # prior corresponding to nearest_nbh method
+        #    prior = prior[n_nearest-1:,][:,n_nearest-1:] # removing first rows/columns corresponding to the nearest_archetypes, and keeping just one row for one archetype
 
         for hs, wave in dwave.items():
             if (trans[hs] is not None):
                 #Only multiply if trans[hs] is not None
                 binned[hs] *= arrtype.asarray(trans[hs][:,None])
-            #Create 3-d tdata with narch x nwave x nbasis where nbasis = 1+nleg
-            if nleg > 0:
-                tdata[hs] = arrtype.append(binned[hs].transpose()[:,:,None], arrtype.tile(arrtype.asarray(legendre[hs]).transpose()[None,:,:], (self._narch, 1, 1)), axis=2)
-            else:
-                tdata[hs] = binned[hs].transpose()[:,:,None]
-            nbasis = tdata[hs].shape[2]
 
         if per_camera:
+            #Use per_camera_coeff_with_least_square_batch which has all logic associated with BVLS/NNLS solver methods
             if (use_gpu):
-                (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, tdata, gpuweights, gpuflux, gpuwflux, nleg, self._narch, method=solve_method, n_nbh=1, prior=prior, use_gpu=use_gpu, bands=bands)
+                (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, gpuweights, gpuflux, gpuwflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, bands=bands)
+                #(zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, gpuweights, gpuflux, gpuwflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, ncam=ncam)
             else:
-                (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux, nleg, self._narch, method=solve_method, n_nbh=1, prior=prior, use_gpu=use_gpu, bands=bands)
+                (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, bands=bands)
+                #(zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, ncam=ncam)
         else:
+            #Calculate tdata here because binned is passed to per_camera_coeff_with_least_square_batch
+            tdata = dict()
+            nbasis = 1
+            for hs, wave in dwave.items():
+                #Create 3-d tdata with narch x nwave x nbasis where nbasis = 1+nleg
+                if nleg > 0:
+                    tdata[hs] = arrtype.append(binned[hs].transpose()[:,:,None], arrtype.tile(arrtype.asarray(legendre[hs]).transpose()[None,:,:], (self._narch, 1, 1)), axis=2)
+                else:
+                    tdata[hs] = binned[hs].transpose()[:,:,None]
+                nbasis = tdata[hs].shape[2]
             if (use_gpu):
                 (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, tdata, gpuweights, gpuflux, gpuwflux, self._narch, nbasis, use_gpu=use_gpu)
             else:
                 (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, tdata, weights, flux, wflux, self._narch, nbasis, use_gpu=use_gpu)
 
         if n_nearest is not None:
-            best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=dedges, binned=binned, use_gpu=use_gpu, prior=nnearest_prior, ncam=ncam)
+            best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=dedges, binned=binned, use_gpu=use_gpu, prior_sigma=prior_sigma, ncam=ncam)
             #print(best_chi2, best_coeff, best_fulltype)
             return best_chi2, best_coeff, best_fulltype
         else:
diff --git a/py/redrock/fitz.py b/py/redrock/fitz.py
index 400f3b1..6a10d49 100644
--- a/py/redrock/fitz.py
+++ b/py/redrock/fitz.py
@@ -106,25 +106,13 @@ def minfit(x, y):
 
     return (x0, xerr, y0, zwarn)
 
-def prior_on_coeffs(n_nbh, deg_legendre, sigma, ncamera):
-    
-    """
-    Args:
-        n_nbh (int): number of dominant archetypes
-        deg_legendre (int): number of Legendre polynomials 
-        sigma (int): prior sigma to be used for archetype fitting
-        ncamera (int): number of cameras for given instrument
-    Returns:
-        2d array to be added while solving for archetype fitting
-
-    """
-    
-    nbasis = n_nbh+deg_legendre*ncamera # 3 desi cameras
-    prior = np.zeros((nbasis, nbasis), dtype='float64');np.fill_diagonal(prior, 1/(sigma**2))
-    for i in range(n_nbh):
-        prior[i][i]=0. ## Do not add prior to the archetypes, added only to the Legendre polynomials
-    return prior
+def legendre_calculate(nleg, dwave):
+    wave = np.concatenate([ w for w in dwave.values() ])
+    wmin = wave.min()
+    wmax = wave.max()
+    legendre = { hs:np.array([scipy.special.legendre(i)( (w-wmin)/(wmax-wmin)*2.) for i in range(nleg)]) for hs, w in dwave.items() }
 
+    return legendre
 
 def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=False, deg_legendre=None,
          zminfit_npoints=15, per_camera=False, n_nearest=None, prior_sigma=None):
@@ -293,6 +281,8 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
                 coeff = np.zeros(template.nbasis)
                 zwarn |= ZW.Z_FITLIMIT
                 zwarn |= ZW.BAD_MINFIT
+                #Set trans to None so as not to pass an empty dict to archetypes! CW 2/26/24
+                trans = None
             else:
                 #- Unknown problem; re-raise error
                 raise err
@@ -331,12 +321,9 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
                 else:
                     ncamera = 1
                 if n_nearest is None:
-                    prior = prior_on_coeffs(1, deg_legendre, prior_sigma, ncamera)
-                else:
-                    prior = prior_on_coeffs(n_nearest, deg_legendre, prior_sigma, ncamera)
-            else:
-                prior=None
-            chi2min, coeff, fulltype = archetype.get_best_archetype(target,weights,flux,wflux,dwave,zbest, per_camera, n_nearest, trans=trans, use_gpu=use_gpu, prior=prior)
+                    n_nearest = 1
+            #Pass prior_sigma as a scalar and move calculation of prior array to zscan.py in per_camera_coeff_with_least_square_batch
+            chi2min, coeff, fulltype = archetype.get_best_archetype(target,weights,flux,wflux,dwave,zbest, per_camera, n_nearest, trans=trans, use_gpu=use_gpu, prior_sigma=prior_sigma)
             del trans
 
             results.append(dict(z=zbest, zerr=zerr, zwarn=zwarn,
diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py
index 0b1fee2..10a0c8d 100644
--- a/py/redrock/zscan.py
+++ b/py/redrock/zscan.py
@@ -238,8 +238,41 @@ def calc_zchi2_one(spectra, weights, flux, wflux, tdata, solve_matrices_algorith
 
     return zchi2, zcoeff
 
-def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux, nleg, narch, method=None, n_nbh=None, prior=None, use_gpu=False, bands=None):
-    
+def prior_on_coeffs(n_nbh, deg_legendre, sigma, ncamera, add_negative_legendre_terms=False):
+
+    """
+    Args:
+        n_nbh (int): number of dominant archetypes
+        deg_legendre (int): number of Legendre polynomials
+        sigma (int): prior sigma to be used for archetype fitting
+        ncamera (int): number of cameras for given instrument
+        add_negative_legendre_terms (bool): whether to adjust prior
+            to account for negative legendre terms
+    Returns:
+        2d array to be added while solving for archetype fitting
+
+    """
+
+    tot_legendre_terms = deg_legendre
+    if add_negative_legendre_terms:
+        #Double total legendre term count to reflect negative terms
+        tot_legendre_terms = deg_legendre*2
+    nbasis = n_nbh+tot_legendre_terms*ncamera # 3 desi cameras
+    prior = np.zeros((nbasis, nbasis), dtype='float64');np.fill_diagonal(prior, 1/(sigma**2))
+    for i in range(n_nbh):
+        prior[i][i]=0. ## Do not add prior to the archetypes, added only to the Legendre polynomials
+
+    if add_negative_legendre_terms:
+        #Add cross terms to prior array
+        for i in range(ncamera):
+            for j in range(deg_legendre):
+                idx1 = n_nbh+i*tot_legendre_terms+j
+                idx2 = n_nbh+i*tot_legendre_terms+deg_legendre+j
+                prior[idx1][idx2] = -prior[idx1][idx1]
+                prior[idx2][idx1] = -prior[idx2][idx2]
+    return prior
+
+def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, narch, method=None, n_nbh=None, prior_sigma=None, use_gpu=False, bands=None):
     """This function calculates coefficients for archetype mode in each camera using normal linear algebra matrix solver or BVLS (bounded value least square) method
 
     BVLS described in : https://www.stat.berkeley.edu/~stark/Preprints/bvls.pdf
@@ -248,7 +281,7 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux
 
     Args:
         target (object): target object
-        tdata (dict): template data for model fit for ALL archetypes
+        binned (dict): template data for model fit for ALL archetypes
         weights (array): concatenated spectral weights (ivar).
         flux (array): concatenated flux values.
         wflux (array): concatenated weighted flux values.
@@ -268,8 +301,9 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux
     ### TODO - implement BVLS on GPU
     # number of cameras in DESI: b, r, z
     spectra = target.spectra
+    if bands is None:
+        bands = ['b', 'z', 'r']
     ncam = len(bands)
-    nbasis = n_nbh+nleg*ncam # n_nbh : for actual physical archetype(s), nleg: number of legendre polynomials, ncamera: number of cameras
 
     ret_zcoeff = {}
     ret_zcoeff['alpha'] = []
@@ -279,8 +313,56 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux
 
     new_bands = sorted(bands) # saves as correct order
 
-    #Setup dict of solver args to pass bounds to solver
     method = method.upper()
+    add_negative_legendre_terms = False
+    if (method == 'BVLS' and use_gpu):
+        add_negative_legendre_terms = True
+
+    #Calculate prior here
+    prior = prior_on_coeffs(n_nbh, nleg, prior_sigma, ncam, add_negative_legendre_terms)
+    legendre = target.legendre(nleg=nleg, use_gpu=False) #Get previously calculated legendre
+
+    if add_negative_legendre_terms:
+        #Hard-code NNLS with additional legendre terms if BVLS AND gpu mode
+        method = 'NNLS'
+        #tdata should already have the additional negative legendre terms
+        nleg *= 2
+        if narch == 1:
+            #Use CPU if only 1 narch (nearest neighbor)
+            use_gpu = False
+
+    #Calculate nbasis after accounting for legendre terms
+    nbasis = n_nbh+nleg*ncam # n_nbh : for actual physical archetype(s), nleg: number of legendre polynomials, ncamera: number of cameras
+
+    #Calculate tdata arrays here instead of in archetypes
+    #Select np or cp for operations as arrtype
+    if (use_gpu):
+        import cupy as cp
+        arrtype = cp
+    else:
+        arrtype = np
+    tdata = dict()
+    for hs in binned: 
+        #Create 3-d tdata with narch x nwave x nbasis where nbasis = 1+nleg
+        if narch == 1:
+            tdata[hs] = binned[hs][None,:,:]
+            if nleg > 0:
+                tdata[hs] = np.append(tdata[hs], legendre[hs].transpose()[None,:,:], axis=2)
+                if add_negative_legendre_terms:
+                #if method == 'BVLS' and use_gpu:
+                    #Using BVLS and GPU - append negative copies of legendre terms as well to use NNLS with additional bases to approximate BVLS
+                    tdata[hs] = np.append(tdata[hs], -1*legendre[hs].transpose()[None,:,:], axis=2)
+        else:
+            if nleg > 0:
+                tdata[hs] = arrtype.append(binned[hs].transpose()[:,:,None], arrtype.tile(arrtype.asarray(legendre[hs]).transpose()[None,:,:], (narch, 1, 1)), axis=2)
+                if add_negative_legendre_terms:
+                #if method == 'BVLS' and use_gpu:
+                    #Using BVLS - append negative copies of legendre terms as well to use NNLS with additional bases to approximate BVLS
+                    tdata[hs] = arrtype.append(tdata[hs], arrtype.tile(-1*arrtype.asarray(legendre[hs]).transpose()[None,:,:], (narch, 1, 1)), axis=2)
+            else:
+                tdata[hs] = binned[hs].transpose()[:,:,None]
+
+    #Setup dict of solver args to pass bounds to solver
     solver_args = dict()
     if (method == 'BVLS'):
         #only positive coefficients are allowed for the archetypes
@@ -317,6 +399,8 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux
                         tdata2[hs][:,k+nleg*i] = tdata[hs][j,:,k] # Legendre polynomials terms
                 tdata2[hs] = tdata2[hs][None,:,:]
             zzchi2[j], zzcoeff[j] = calc_zchi2_batch(spectra, tdata2, weights, flux, wflux, 1, nbasis, solve_matrices_algorithm=method, solver_args=solver_args, prior=prior, use_gpu=use_gpu)
+    if (add_negative_legendre_terms):
+        zzcoeff = remove_extra_legendre_terms(zzcoeff, n_nbh, ncam, nleg)
 
     # saving leading archetype coefficients in correct order
     ret_zcoeff['alpha'] = [zzcoeff[:,k] for k in range(n_nbh)] # archetype coefficient(s)
@@ -337,6 +421,15 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux
     #print(f'{time.time()-start} [sec] took for per camera BVLS method\n')
     return zzchi2, coeff
 
+def remove_extra_legendre_terms(zzcoeff, n_nbh, ncam, nleg):
+    nleg = nleg // 2
+    zzcoeff2 = np.zeros_like(zzcoeff, shape=(zzcoeff.shape[0], n_nbh+ncam*(nleg)))
+    zzcoeff2[:,:n_nbh] = zzcoeff[:,:n_nbh]
+    for j in range(ncam):
+        for l in range(nleg):
+            zzcoeff2[:,n_nbh+nleg*j+l] = zzcoeff[:,n_nbh+nleg*j*2+l] - zzcoeff[:,n_nbh+nleg*j*2+nleg+l]
+    return zzcoeff2
+
 def batch_dot_product_sparse(spectra, tdata, nz, use_gpu):
     """Calculate a batch dot product of the 3 sparse matrices in spectra
     with every template in tdata.  Sparse matrix libraries are used