diff --git a/tools/fregrid_gpu/conserve_interp_gpu.c b/tools/fregrid_gpu/conserve_interp_gpu.c
index e8426067..26701ea2 100644
--- a/tools/fregrid_gpu/conserve_interp_gpu.c
+++ b/tools/fregrid_gpu/conserve_interp_gpu.c
@@ -18,6 +18,7 @@
* .
**********************************************************************/
#include
+#include
#include
#include
#include
@@ -51,7 +52,6 @@ void setup_conserve_interp_gpu(int ntiles_input_grid, Grid_config *input_grid, i
int nlon_output_cells = output_grid[otile].nxc;
int nlat_output_cells = output_grid[otile].nyc;
- int ncells_output_grid = nlon_output_cells*nlat_output_cells;
int ngridpts_output_grid = (nlon_output_cells+1)*(nlat_output_cells+1);
Grid_cells_struct_config output_grid_cells;
@@ -80,7 +80,9 @@ void setup_conserve_interp_gpu(int ntiles_input_grid, Grid_config *input_grid, i
get_bounding_indices_gpu(nlon_output_cells, nlat_output_cells, nlon_input_cells, nlat_input_cells,
output_grid[otile].latc, input_grid[itile].latc, &jlat_overlap_starts, &jlat_overlap_ends);
- create_upbound_nxcells_arrays_on_device_gpu( ncells_input_grid, &approx_nxcells_per_ij1, &ij2_start, &ij2_end);
+ approx_nxcells_per_ij1 = (int *) acc_malloc(ncells_input_grid*sizeof(int));
+ ij2_start = (int *) acc_malloc(ncells_input_grid*sizeof(int));
+ ij2_end = (int *) acc_malloc(ncells_input_grid*sizeof(int));
upbound_nxcells = get_upbound_nxcells_2dx2d_gpu( nlon_input_cells, nlat_input_cells,
nlon_output_cells, nlat_output_cells,
@@ -126,13 +128,25 @@ void setup_conserve_interp_gpu(int ntiles_input_grid, Grid_config *input_grid, i
else mpp_error("conserve_interp: interp_method should be CONSERVE_ORDER1 or CONSERVE_ORDER2");
} //conserve_order methods
- free_upbound_nxcells_arrays_gpu(ncells_input_grid, &approx_nxcells_per_ij1, &ij2_start, &ij2_end);
+ acc_free(approx_nxcells_per_ij1); approx_nxcells_per_ij1 = NULL;
+ acc_free(ij2_start); ij2_start = NULL;
+ acc_free(ij2_end); ij2_end = NULL;
+
free_input_grid_mask_gpu(ncells_input_grid, &input_grid_mask);
delete_grid_from_device_gpu(ngridpts_input_grid, input_grid[itile].lonc, input_grid[itile].latc);
} //input tile
- free_grid_cell_struct_gpu( ncells_output_grid, &output_grid_cells);
+
+ acc_free(output_grid_cells.lon_min); output_grid_cells.lon_min = NULL;
+ acc_free(output_grid_cells.lon_max); output_grid_cells.lon_max = NULL;
+ acc_free(output_grid_cells.lon_cent); output_grid_cells.lon_cent = NULL;
+ acc_free(output_grid_cells.lat_min); output_grid_cells.lat_min = NULL;
+ acc_free(output_grid_cells.lat_max); output_grid_cells.lat_max = NULL;
+ acc_free(output_grid_cells.area); output_grid_cells.area = NULL;
+ acc_free(output_grid_cells.nvertices); output_grid_cells.nvertices=NULL;
+ acc_free(output_grid_cells.lon_vertices); output_grid_cells.lon_vertices = NULL;
+ acc_free(output_grid_cells.lat_vertices); output_grid_cells.lat_vertices = NULL;
delete_grid_from_device_gpu(ngridpts_output_grid, output_grid[otile].lonc, output_grid[otile].latc);
}//output tile
@@ -479,16 +493,14 @@ void do_scalar_conserve_interp_gpu(Interp_config_gpu *interp_gpu, int varid, int
int ncells_output_grid = output_grid[otile].nxc * output_grid[otile].nyc;
double *p_fieldout_data = NULL; p_fieldout_data = field_out[otile].data;
- int *out_miss = NULL ; out_miss = (int *)malloc(ncells_output_grid*sizeof(int));
- double *out_area = NULL ; out_area = (double *)malloc(ncells_output_grid*sizeof(double));
-#pragma acc enter data create(p_fieldout_data[:ncells_output_grid], \
- out_area[:ncells_output_grid], \
- out_miss[:ncells_output_grid])
+ int *out_miss = NULL ; out_miss = (int *) acc_malloc(ncells_output_grid*sizeof(int));
+ double *out_area = NULL ; out_area = (double *) acc_malloc(ncells_output_grid*sizeof(double));
-#pragma acc parallel loop present(p_fieldout_data[:ncells_output_grid], \
- out_area[:ncells_output_grid], \
- out_miss[:ncells_output_grid])
+#pragma acc enter data create(p_fieldout_data[:ncells_output_grid])
+
+#pragma acc parallel loop present(p_fieldout_data[:ncells_output_grid]) deviceptr(out_area, \
+ out_miss)
for(int i=0; i 0) gsum_out += p_fieldout_data[i];
}
}
if ( cell_methods == CELL_METHODS_SUM ) {
-#pragma acc parallel loop present(out_area[:ncells_output_grid], \
- out_miss[:ncells_output_grid], \
- p_fieldout_data[:ncells_output_grid])
+#pragma acc parallel loop present(p_fieldout_data[:ncells_output_grid]) deviceptr(out_miss)
+
for(int i=0; i 0) {
p_fieldout_data[i] /= out_area[i];
@@ -553,7 +562,7 @@ void do_scalar_conserve_interp_gpu(Interp_config_gpu *interp_gpu, int varid, int
}
if( (target_grid) ) {
-#pragma acc parallel loop present(out_area[:ncells_output_grid])
+#pragma acc parallel loop deviceptr(out_area)
for(int i=0; inxcells;
#pragma acc parallel loop present(minterp_gpu->output_parent_cell_index[:ixcells], \
minterp_gpu->input_parent_cell_index[:ixcells], \
- minterp_gpu->xcell_area[:ixcells], \
- out_area[:ncells_output_grid])\
+ minterp_gpu->xcell_area[:ixcells])\
copyin(p_fieldin_area[:ncells_input_grid],\
- p_gridin_area[:ncells_input_grid])
+ p_gridin_area[:ncells_input_grid])\
+ deviceptr(out_area)
for(int ix=0; ixoutput_parent_cell_index[ix];
int ij1 = minterp_gpu->input_parent_cell_index[ix];
@@ -575,8 +584,9 @@ void do_scalar_conserve_interp_gpu(Interp_config_gpu *interp_gpu, int varid, int
if(cell_measures ) out_area[ij2] += (area*p_fieldin_area[ij1]/p_gridin_area[ij1]);
else out_area[ij2] += area;
}
-#pragma acc parallel loop present(p_fieldout_data[:ncells_output_grid], out_area[:ncells_output_grid], \
- output_grid[otile].cell_area[:ncells_output_grid])
+#pragma acc parallel loop present(p_fieldout_data[:ncells_output_grid], \
+ output_grid[otile].cell_area[:ncells_output_grid])\
+ deviceptr(out_area)
for(int i=0; iweight;
if(cell_methods == CELL_METHODS_SUM) {
-#pragma acc parallel loop present(input_area_weight[:ncells_input_grid]) copyin(p_gridin_area[:ncells_input_grid])
+#pragma acc parallel loop deviceptr(input_area_weight) copyin(p_gridin_area[:ncells_input_grid])
for(int i=0 ; iinput_parent_cell_index[:nxcells], \
minterp_gpu->output_parent_cell_index[:nxcells], \
minterp_gpu->xcell_area[:nxcells], \
- input_area_weight[:ncells_input_grid], \
- fieldout_data[:ncells_output_grid], \
- out_area[:ncells_output_grid], \
- out_miss[:ncells_output_grid]) \
+ fieldout_data[:ncells_output_grid]) \
copyin(fieldin_data[:ncells_input_grid])
-#pragma acc parallel loop
+#pragma acc parallel loop deviceptr(input_area_weight, \
+ out_area, \
+ out_miss)
for(int ix=0; ixinput_parent_cell_index[ix];
int ij2 = minterp_gpu->output_parent_cell_index[ix];
@@ -725,14 +733,13 @@ void interp_data_order2( const Grid_config *output_grid, const Grid_config *inpu
minterp_gpu->input_parent_cell_index[:nxcells], \
minterp_gpu->output_parent_cell_index[:nxcells], \
minterp_gpu->xcell_area[:nxcells], \
- input_area_weight[:ncells_input_grid], \
- fieldout_data[:ncells_output_grid], \
- out_area[:ncells_output_grid], \
- out_miss[:ncells_output_grid]) \
+ fieldout_data[:ncells_output_grid]) \
copyin(fieldin_data[:input_data_ncells], \
grad_mask[:ncells_input_grid], \
grad_x[:ncells_input_grid], grad_y[:ncells_input_grid])
-#pragma acc parallel loop
+#pragma acc parallel loop deviceptr(input_area_weight, \
+ out_area, \
+ out_miss)
for(int ix=0; ixinput_parent_cell_index[ix];
int ij2 = minterp_gpu->output_parent_cell_index[ix];
diff --git a/tools/fregrid_gpu/interp_utils_gpu.c b/tools/fregrid_gpu/interp_utils_gpu.c
index 54015ece..1c0d45aa 100644
--- a/tools/fregrid_gpu/interp_utils_gpu.c
+++ b/tools/fregrid_gpu/interp_utils_gpu.c
@@ -78,15 +78,13 @@ void get_input_grid_mask_gpu(const int mask_size, double **input_grid_mask)
{
double *p_input_grid_mask;
-
- *input_grid_mask = (double *)malloc(mask_size*sizeof(double));
+ *input_grid_mask = (double*) acc_malloc(mask_size*sizeof(double));
p_input_grid_mask = *input_grid_mask;
-
-#pragma acc enter data create(p_input_grid_mask[:mask_size])
-#pragma acc parallel loop independent present(p_input_grid_mask[:mask_size])
+#pragma acc parallel loop independent deviceptr(p_input_grid_mask)
for( int i=0 ; ilon_min, \
+ output_grid_cells->lon_max, \
+ output_grid_cells->lat_min, \
+ output_grid_cells->lat_max, \
+ output_grid_cells->lon_cent, \
+ output_grid_cells->area, \
+ output_grid_cells->nvertices, \
+ output_grid_cells->lon_vertices, \
+ output_grid_cells->lat_vertices, \
+ skip_input_cells, \
+ approx_xcells_per_ij1, \
+ ij2_start, \
+ ij2_end)
+ for( int ij1=ij1_start; ij1 MASK_THRESH ) {
int i_approx_xcells_per_ij1=0;
@@ -157,32 +165,36 @@ int create_xgrid_2dx2d_order1_gpu(const int nlon_input_cells, const int nlat_in
double xcell_dclon=-99.99, xcell_dclat=-99.99;
- int *parent_input_index = NULL ; parent_input_index = (int *)malloc(upbound_nxcells*sizeof(int));
- int *parent_output_index = NULL ; parent_output_index = (int *)malloc(upbound_nxcells*sizeof(int));
- int *nxcells_per_ij1 = NULL ; nxcells_per_ij1 = (int *)malloc(input_grid_ncells*sizeof(int));
- double *store_xcell_area = NULL ; store_xcell_area = (double *)malloc(upbound_nxcells*sizeof(double));
+ int *parent_input_index = NULL; parent_input_index = (int*) acc_malloc(upbound_nxcells*sizeof(int));
+ int *parent_output_index = NULL; parent_output_index = (int*) acc_malloc(upbound_nxcells*sizeof(int));
+ int *nxcells_per_ij1 = NULL; nxcells_per_ij1 = (int*) acc_malloc(input_grid_ncells*sizeof(int));
+ double *store_xcell_area = NULL; store_xcell_area = (double*) acc_malloc(upbound_nxcells*sizeof(double));
-#pragma acc enter data create(parent_input_index[:upbound_nxcells], \
- parent_output_index[:upbound_nxcells], \
- store_xcell_area[:upbound_nxcells], \
- nxcells_per_ij1[:input_grid_ncells])
#pragma acc data present(output_grid_lon[:output_grid_npts], \
output_grid_lat[:output_grid_npts], \
input_grid_lon[:input_grid_npts], \
- input_grid_lat[:input_grid_npts], \
- output_grid_cells[:1], \
- approx_nxcells_per_ij1[:input_grid_ncells], \
- ij2_start[:input_grid_ncells], \
- ij2_end[:input_grid_ncells], \
- mask_input_grid[:input_grid_ncells], \
- nxcells_per_ij1[:input_grid_ncells], \
- parent_input_index[:upbound_nxcells], \
- parent_output_index[:upbound_nxcells], \
- store_xcell_area[:upbound_nxcells]) \
+ input_grid_lat[:input_grid_npts]) \
copyin(input_grid_ncells, output_grid_ncells) \
copy(nxcells)
-#pragma acc parallel loop reduction(+:nxcells)
+#pragma acc parallel loop reduction(+:nxcells) \
+ deviceptr(nxcells_per_ij1, \
+ store_xcell_area, \
+ parent_input_index, \
+ parent_output_index, \
+ output_grid_cells->lon_min, \
+ output_grid_cells->lon_max, \
+ output_grid_cells->lat_min, \
+ output_grid_cells->lat_max, \
+ output_grid_cells->lon_cent, \
+ output_grid_cells->area, \
+ output_grid_cells->nvertices, \
+ output_grid_cells->lon_vertices, \
+ output_grid_cells->lat_vertices, \
+ approx_nxcells_per_ij1, \
+ mask_input_grid, \
+ ij2_start, \
+ ij2_end)
for(int ij1=ij1_start; ij1 MASK_THRESH) {
@@ -261,15 +273,11 @@ int create_xgrid_2dx2d_order1_gpu(const int nlon_input_cells, const int nlat_in
approx_nxcells_per_ij1, parent_input_index, parent_output_index,
store_xcell_area, interp_for_itile);
-#pragma acc exit data delete( parent_input_index[:upbound_nxcells], \
- parent_output_index[:upbound_nxcells], \
- store_xcell_area[:upbound_nxcells], \
- nxcells_per_ij1[:input_grid_ncells])
+ acc_free(parent_input_index) ; parent_input_index = NULL;
+ acc_free(parent_output_index); parent_output_index = NULL;
+ acc_free(nxcells_per_ij1) ; nxcells_per_ij1 = NULL;
+ acc_free(store_xcell_area) ; store_xcell_area = NULL;
- free(parent_input_index) ; parent_input_index = NULL;
- free(parent_output_index); parent_output_index = NULL;
- free(nxcells_per_ij1) ; nxcells_per_ij1 = NULL;
- free(store_xcell_area) ; store_xcell_area = NULL;
return nxcells;
@@ -303,44 +311,45 @@ int create_xgrid_2dx2d_order2_gpu(const int nlon_input_cells, const int nlat_in
int ij1_start = jlat_overlap_starts*nlon_input_cells;
int ij1_end = (jlat_overlap_ends+1)*nlon_input_cells;
- int *parent_input_index=NULL ; parent_input_index = (int *)malloc(upbound_nxcells*sizeof(int));
- int *parent_output_index=NULL ; parent_output_index = (int *)malloc(upbound_nxcells*sizeof(int));
- double *store_xcell_area=NULL ; store_xcell_area = (double *)malloc(upbound_nxcells*sizeof(double));
- double *store_xcell_dclon=NULL ; store_xcell_dclon = (double *)malloc(upbound_nxcells*sizeof(double));
- double *store_xcell_dclat=NULL ; store_xcell_dclat = (double *)malloc(upbound_nxcells*sizeof(double));
-
- int *nxcells_per_ij1=NULL ; nxcells_per_ij1 = (int *)malloc(input_grid_ncells*sizeof(int));
- double *summed_input_area=NULL; summed_input_area = (double *)malloc(input_grid_ncells*sizeof(double));
- double *summed_input_clat=NULL; summed_input_clat = (double *)malloc(input_grid_ncells*sizeof(double));
- double *summed_input_clon=NULL; summed_input_clon = (double *)malloc(input_grid_ncells*sizeof(double));
-
-#pragma acc enter data create(parent_input_index[:upbound_nxcells], \
- parent_output_index[:upbound_nxcells], \
- store_xcell_area[:upbound_nxcells], \
- nxcells_per_ij1[:input_grid_ncells], \
- store_xcell_dclon[:upbound_nxcells], \
- store_xcell_dclat[:upbound_nxcells], \
- summed_input_area[:input_grid_ncells], \
- summed_input_clon[:input_grid_ncells], \
- summed_input_clat[:input_grid_ncells])
+ int *parent_input_index=NULL ; parent_input_index = (int *) acc_malloc(upbound_nxcells*sizeof(int));
+ int *parent_output_index=NULL ; parent_output_index = (int *)acc_malloc(upbound_nxcells*sizeof(int));
+ double *store_xcell_area=NULL ; store_xcell_area = (double *) acc_malloc(upbound_nxcells*sizeof(double));
+ double *store_xcell_dclon=NULL ; store_xcell_dclon = (double *) acc_malloc(upbound_nxcells*sizeof(double));
+ double *store_xcell_dclat=NULL ; store_xcell_dclat = (double *) acc_malloc(upbound_nxcells*sizeof(double));
+
+ int *nxcells_per_ij1=NULL ; nxcells_per_ij1 = (int *) acc_malloc(input_grid_ncells*sizeof(int));
+ double *summed_input_area=NULL; summed_input_area = (double *) acc_malloc(input_grid_ncells*sizeof(double));
+ double *summed_input_clat=NULL; summed_input_clat = (double *) acc_malloc(input_grid_ncells*sizeof(double));
+ double *summed_input_clon=NULL; summed_input_clon = (double *) acc_malloc(input_grid_ncells*sizeof(double));
+
#pragma acc data present(output_grid_lon[:output_grid_npts], \
output_grid_lat[:output_grid_npts], \
input_grid_lon[:input_grid_npts], \
- input_grid_lat[:input_grid_npts], \
- output_grid_cells[:1], \
- approx_nxcells_per_ij1[:input_grid_ncells], \
- ij2_start[:input_grid_ncells], \
- ij2_end[:input_grid_ncells], \
- mask_input_grid[:input_grid_ncells], \
- nxcells_per_ij1[:input_grid_ncells], \
- parent_input_index[:upbound_nxcells], \
- parent_output_index[:upbound_nxcells], \
- store_xcell_area[:upbound_nxcells], \
- store_xcell_dclon[:upbound_nxcells], \
- store_xcell_dclat[:upbound_nxcells]) \
+ input_grid_lat[:input_grid_npts]) \
copyin(input_grid_ncells, output_grid_ncells) copy(nxcells)
-#pragma acc parallel loop reduction(+:nxcells)
+#pragma acc parallel loop reduction(+:nxcells) \
+ deviceptr(parent_input_index, \
+ parent_output_index, \
+ store_xcell_area, \
+ store_xcell_dclon, \
+ store_xcell_dclat, \
+ summed_input_area, \
+ summed_input_clon, \
+ summed_input_clat, \
+ nxcells_per_ij1, \
+ output_grid_cells->lon_min, \
+ output_grid_cells->lon_max, \
+ output_grid_cells->lat_min, \
+ output_grid_cells->lon_cent, \
+ output_grid_cells->area, \
+ output_grid_cells->nvertices, \
+ output_grid_cells->lon_vertices, \
+ output_grid_cells->lat_vertices, \
+ approx_nxcells_per_ij1, \
+ mask_input_grid, \
+ ij2_start, \
+ ij2_end)
for(int ij1=ij1_start; ij1 MASK_THRESH) {
@@ -366,7 +375,8 @@ int create_xgrid_2dx2d_order2_gpu(const int nlon_input_cells, const int nlat_in
#pragma acc loop seq reduction(+:ixcell) \
reduction(+:summed_input_area_ij1) \
reduction(+:summed_input_clon_ij1) \
- reduction(+:summed_input_clat_ij1)
+ reduction(+:summed_input_clat_ij1) \
+
for(int ij2=ij2_start[ij1]; ij2<=ij2_end[ij1]; ij2++) {
int nvertices2, xvertices=1;
@@ -439,11 +449,10 @@ int create_xgrid_2dx2d_order2_gpu(const int nlon_input_cells, const int nlat_in
interp_for_itile->dcentroid_lat[:nxcells], \
input_grid_lon[:input_grid_ncells], \
input_grid_lat[:input_grid_ncells], \
- summed_input_area[:input_grid_ncells], \
- summed_input_clon[:input_grid_ncells], \
- summed_input_clat[:input_grid_ncells], \
interp_for_itile->input_parent_cell_index[:nxcells]) \
- copyin(readin_input_area[:input_grid_ncells])
+ copyin(readin_input_area[:input_grid_ncells]) deviceptr(summed_input_area, \
+ summed_input_clon, \
+ summed_input_clat)
for(int ix=0 ; ixinput_parent_cell_index[ix];
double input_area = summed_input_area[ij1];
@@ -463,25 +472,15 @@ int create_xgrid_2dx2d_order2_gpu(const int nlon_input_cells, const int nlat_in
interp_for_itile->dcentroid_lat[ix] -= input_clat/input_area;
}
-#pragma acc exit data delete( parent_input_index[:upbound_nxcells], \
- parent_output_index[:upbound_nxcells], \
- store_xcell_area[:upbound_nxcells], \
- nxcells_per_ij1[:input_grid_ncells], \
- store_xcell_dclon[:upbound_nxcells], \
- store_xcell_dclat[:upbound_nxcells], \
- summed_input_area[:input_grid_ncells], \
- summed_input_clon[:input_grid_ncells], \
- summed_input_clat[:input_grid_ncells])
-
- free(parent_input_index) ; parent_input_index = NULL;
- free(parent_output_index) ; parent_output_index = NULL;
- free(store_xcell_area) ; store_xcell_area = NULL;
- free(nxcells_per_ij1) ; nxcells_per_ij1 = NULL;
- free(store_xcell_dclon) ; store_xcell_dclon = NULL;
- free(store_xcell_dclat) ; store_xcell_dclat = NULL;
- free(summed_input_area); summed_input_area = NULL;
- free(summed_input_clon); summed_input_clon = NULL;
- free(summed_input_clat); summed_input_clat = NULL;
+ acc_free(parent_input_index); parent_input_index = NULL;
+ acc_free(parent_output_index); parent_output_index = NULL;
+ acc_free(store_xcell_area); store_xcell_area = NULL;
+ acc_free(nxcells_per_ij1); nxcells_per_ij1 = NULL;
+ acc_free(store_xcell_dclon); store_xcell_dclon = NULL;
+ acc_free(store_xcell_dclat); store_xcell_dclat = NULL;
+ acc_free(summed_input_area); summed_input_area = NULL;
+ acc_free(summed_input_clon); summed_input_clon = NULL;
+ acc_free(summed_input_clat); summed_input_clat = NULL;
return nxcells;
diff --git a/tools/libfrencutils_gpu/create_xgrid_utils_gpu.c b/tools/libfrencutils_gpu/create_xgrid_utils_gpu.c
index 706c4cb9..291d72b7 100644
--- a/tools/libfrencutils_gpu/create_xgrid_utils_gpu.c
+++ b/tools/libfrencutils_gpu/create_xgrid_utils_gpu.c
@@ -653,26 +653,26 @@ void get_grid_cell_struct_gpu( const int nlon, const int nlat, const Grid_config
int ncells=nlon*nlat;
int npts=(nlon+1)*(nlat+1);
- grid_cells->lon_min = (double *)malloc(ncells*sizeof(double));
- grid_cells->lon_max = (double *)malloc(ncells*sizeof(double));
- grid_cells->lat_min = (double *)malloc(ncells*sizeof(double));
- grid_cells->lat_max = (double *)malloc(ncells*sizeof(double));
- grid_cells->lon_cent = (double *)malloc(ncells*sizeof(double));
- grid_cells->area = (double *)malloc(ncells*sizeof(double));
- grid_cells->nvertices = (int *)malloc(ncells*sizeof(int));
- grid_cells->lon_vertices = (double *)malloc(MAX_V*ncells*sizeof(double));
- grid_cells->lat_vertices = (double *)malloc(MAX_V*ncells*sizeof(double));
-
-#pragma acc enter data create(grid_cells[:1])
-#pragma acc enter data create(grid_cells->lon_min[:ncells], grid_cells->lon_max[:ncells], \
- grid_cells->lat_min[:ncells], grid_cells->lat_max[:ncells], \
- grid_cells->lon_cent[:ncells], grid_cells->nvertices[:ncells],\
- grid_cells->area[:ncells])
-#pragma acc enter data create(grid_cells->lon_vertices[:MAX_V*ncells], \
- grid_cells->lat_vertices[:MAX_V*ncells])
-
-#pragma acc data present(grid_cells[:1], lon[:npts], lat[:npts])
-#pragma acc parallel loop independent
+ grid_cells->lon_min = (double *) acc_malloc(ncells*sizeof(double));
+ grid_cells->lon_max = (double *) acc_malloc(ncells*sizeof(double));
+ grid_cells->lat_min = (double *) acc_malloc(ncells*sizeof(double));
+ grid_cells->lat_max = (double *) acc_malloc(ncells*sizeof(double));
+ grid_cells->lon_cent = (double *) acc_malloc(ncells*sizeof(double));
+ grid_cells->area = (double *) acc_malloc(ncells*sizeof(double));
+ grid_cells->nvertices = (int *) acc_malloc(ncells*sizeof(int));
+ grid_cells->lon_vertices = (double *) acc_malloc(MAX_V*ncells*sizeof(double));
+ grid_cells->lat_vertices = (double *) acc_malloc(MAX_V*ncells*sizeof(double));
+
+#pragma acc data present(lon[:npts], lat[:npts])
+#pragma acc parallel loop independent deviceptr(grid_cells->lon_min, \
+ grid_cells->lon_max, \
+ grid_cells->lat_min, \
+ grid_cells->lat_max, \
+ grid_cells->lon_cent, \
+ grid_cells->area, \
+ grid_cells->nvertices, \
+ grid_cells->lon_vertices, \
+ grid_cells->lat_vertices)
for(int icell=0; icelllon_vertices, \
- grid_cells->lat_vertices, \
- grid_cells->lon_min, \
- grid_cells->lon_max, \
- grid_cells->lon_cent, \
- grid_cells->lat_max, \
- grid_cells->lat_min, \
- grid_cells->nvertices, \
- grid_cells->area)
-#pragma acc exit data delete(grid_cells)
-
- free(grid_cells->lon_min); grid_cells->lon_min = NULL;
- free(grid_cells->lon_max); grid_cells->lon_max = NULL;
- free(grid_cells->lon_cent); grid_cells->lon_cent = NULL;
- free(grid_cells->lat_min); grid_cells->lat_min = NULL;
- free(grid_cells->lat_max); grid_cells->lat_max = NULL;
- free(grid_cells->area); grid_cells->area = NULL;
- free(grid_cells->nvertices); grid_cells->nvertices=NULL;
- free(grid_cells->lon_vertices); grid_cells->lon_vertices = NULL;
- free(grid_cells->lat_vertices); grid_cells->lat_vertices = NULL;
-}
-
-
void get_cell_vertices_gpu( const int icell, const int nlon, const double *lon, const double *lat, double *x, double *y )
{
@@ -746,48 +720,6 @@ void get_cell_vertices_gpu( const int icell, const int nlon, const double *lon,
}
-void create_upbound_nxcells_arrays_on_device_gpu(const int n, int **approx_nxcells_per_ij1,
- int **ij2_start, int **ij2_end)
-{
-
- int *p_approx_nxcells_per_ij1;
- int *p_ij2_start;
- int *p_ij2_end;
-
- *approx_nxcells_per_ij1 = (int *)malloc(n*sizeof(int));
- *ij2_start = (int *)malloc(n*sizeof(int));
- *ij2_end = (int *)malloc(n*sizeof(int));
-
- p_approx_nxcells_per_ij1 = *approx_nxcells_per_ij1;
- p_ij2_start = *ij2_start;
- p_ij2_end = *ij2_end;
-
-#pragma acc enter data copyin(p_approx_nxcells_per_ij1[:n], \
- p_ij2_start[:n], \
- p_ij2_end[:n])
-
-}
-
-void free_upbound_nxcells_arrays_gpu( const int n, int **approx_nxcells_per_ij1,
- int **ij2_start, int **ij2_end)
-{
- int *p_approx_nxcells_per_ij1;
- int *p_ij2_start ;
- int *p_ij2_end;
-
- p_approx_nxcells_per_ij1 = *approx_nxcells_per_ij1;
- p_ij2_start = *ij2_start;
- p_ij2_end = *ij2_end;
-
-#pragma acc exit data delete(p_approx_nxcells_per_ij1[:n], \
- p_ij2_start[:n], \
- p_ij2_end[:n])
-
- free(*approx_nxcells_per_ij1); *approx_nxcells_per_ij1 = NULL;
- free(*ij2_start) ; *ij2_start = NULL;
- free(*ij2_end) ; *ij2_end = NULL;
-}
-
void copy_data_to_interp_on_device_gpu(const int nxcells, const int input_ncells, const int upbound_nxcells,
int *xcells_per_ij1, double *xcell_dclon, double *xcell_dclat,
int *approx_xcells_per_ij1, int *parent_input_index, int *parent_output_index,
@@ -813,11 +745,12 @@ void copy_data_to_interp_on_device_gpu(const int nxcells, const int input_ncells
#pragma acc enter data if(copy_xcentroid) create(interp_for_input_tile->dcentroid_lon[:nxcells], \
interp_for_input_tile->dcentroid_lat[:nxcells])
-#pragma acc data present(xcells_per_ij1[:input_ncells], approx_xcells_per_ij1[:input_ncells], \
- parent_input_index[:upbound_nxcells], \
- parent_output_index[:upbound_nxcells], \
- xcell_areas[:upbound_nxcells], interp_for_input_tile[:1])
-#pragma acc parallel loop independent async(0)
+#pragma acc data present(interp_for_input_tile[:1])
+#pragma acc parallel loop independent async(0) deviceptr(xcells_per_ij1, \
+ approx_xcells_per_ij1, \
+ parent_input_index, \
+ parent_output_index, \
+ xcell_areas)
for(int ij1=0 ; ij1