Actual source code: veccuda.c
  1: /*
  2:  Implementation of the sequential cuda vectors.
  4:  This file contains the code that can be compiled with a C
  5:  compiler.  The companion file veccuda2.cu contains the code that
  6:  must be compiled with nvcc or a C++ compiler.
  7:  */
  9: #define PETSC_SKIP_SPINLOCK
 11: #include <petscconf.h>
 12: #include <petsc/private/vecimpl.h>
 13: #include <../src/vec/vec/impls/dvecimpl.h>
 14: #include <petsc/private/cudavecimpl.h>
 16: PetscErrorCode VecCUDAGetArrays_Private(Vec v, const PetscScalar **x, const PetscScalar **x_d, PetscOffloadMask *flg)
 17: {
 19:   if (x) {
 20:     Vec_Seq *h = (Vec_Seq *)v->data;
 22:     *x = h->array;
 23:   }
 24:   if (x_d) {
 25:     Vec_CUDA *d = (Vec_CUDA *)v->spptr;
 27:     *x_d = d ? d->GPUarray : NULL;
 28:   }
 29:   if (flg) *flg = v->offloadmask;
 30:   return 0;
 31: }
 33: /*
 34:     Allocates space for the vector array on the Host if it does not exist.
 35:     Does NOT change the PetscCUDAFlag for the vector
 36:     Does NOT zero the CUDA array
 37:  */
 38: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
 39: {
 40:   PetscScalar *array;
 41:   Vec_Seq     *s = (Vec_Seq *)v->data;
 42:   PetscInt     n = v->map->n;
 44:   if (!s) {
 45:     PetscNew(&s);
 46:     v->data = s;
 47:   }
 48:   if (!s->array) {
 49:     if (n * sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
 50:       PetscMallocSetCUDAHost();
 51:       v->pinned_memory = PETSC_TRUE;
 52:     }
 53:     PetscMalloc1(n, &array);
 54:     s->array           = array;
 55:     s->array_allocated = array;
 56:     if (n * sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) PetscMallocResetCUDAHost();
 57:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) v->offloadmask = PETSC_OFFLOAD_CPU;
 58:   }
 59:   return 0;
 60: }
 62: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin, Vec yin)
 63: {
 64:   PetscScalar       *ya;
 65:   const PetscScalar *xa;
 67:   VecCUDAAllocateCheckHost(xin);
 68:   VecCUDAAllocateCheckHost(yin);
 69:   if (xin != yin) {
 70:     VecGetArrayRead(xin, &xa);
 71:     VecGetArray(yin, &ya);
 72:     PetscArraycpy(ya, xa, xin->map->n);
 73:     VecRestoreArrayRead(xin, &xa);
 74:     VecRestoreArray(yin, &ya);
 75:   }
 76:   return 0;
 77: }
 79: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin, PetscRandom r)
 80: {
 81:   PetscInt     n = xin->map->n;
 82:   PetscBool    iscurand;
 83:   PetscScalar *xx;
 85:   PetscObjectTypeCompare((PetscObject)r, PETSCCURAND, &iscurand);
 86:   if (iscurand) {
 87:     VecCUDAGetArrayWrite(xin, &xx);
 88:   } else {
 89:     VecGetArrayWrite(xin, &xx);
 90:   }
 91:   PetscRandomGetValues(r, n, xx);
 92:   if (iscurand) {
 93:     VecCUDARestoreArrayWrite(xin, &xx);
 94:   } else {
 95:     VecRestoreArrayWrite(xin, &xx);
 96:   }
 97:   return 0;
 98: }
100: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
101: {
102:   Vec_Seq *vs = (Vec_Seq *)v->data;
104:   PetscObjectSAWsViewOff(v);
105: #if defined(PETSC_USE_LOG)
106:   PetscLogObjectState((PetscObject)v, "Length=%" PetscInt_FMT, v->map->n);
107: #endif
108:   if (vs) {
109:     if (vs->array_allocated) {
110:       if (v->pinned_memory) PetscMallocSetCUDAHost();
111:       PetscFree(vs->array_allocated);
112:       if (v->pinned_memory) {
113:         PetscMallocResetCUDAHost();
114:         v->pinned_memory = PETSC_FALSE;
115:       }
116:     }
117:     VecDestroy_Seq(v);
118:   }
119:   return 0;
120: }
122: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
123: {
124:   Vec_Seq *v = (Vec_Seq *)vin->data;
126:   v->array         = v->unplacedarray;
127:   v->unplacedarray = 0;
128:   return 0;
129: }
131: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
132: {
133:   VecCUDACopyFromGPU(vin);
134:   VecResetArray_SeqCUDA_Private(vin);
135:   vin->offloadmask = PETSC_OFFLOAD_CPU;
136:   return 0;
137: }
139: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin, const PetscScalar *a)
140: {
141:   VecCUDACopyFromGPU(vin);
142:   VecPlaceArray_Seq(vin, a);
143:   vin->offloadmask = PETSC_OFFLOAD_CPU;
144:   return 0;
145: }
147: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin, const PetscScalar *a)
148: {
149:   Vec_Seq *vs = (Vec_Seq *)vin->data;
151:   if (vs->array != vs->array_allocated) {
152:     /* make sure the users array has the latest values */
153:     VecCUDACopyFromGPU(vin);
154:   }
155:   if (vs->array_allocated) {
156:     if (vin->pinned_memory) PetscMallocSetCUDAHost();
157:     PetscFree(vs->array_allocated);
158:     if (vin->pinned_memory) PetscMallocResetCUDAHost();
159:   }
160:   vin->pinned_memory  = PETSC_FALSE;
161:   vs->array_allocated = vs->array = (PetscScalar *)a;
162:   vin->offloadmask                = PETSC_OFFLOAD_CPU;
163:   return 0;
164: }
166: /*@
167:  VecCreateSeqCUDA - Creates a standard, sequential array-style vector.
169:  Collective
171:  Input Parameter:
172:  +  comm - the communicator, should be PETSC_COMM_SELF
173:  -  n - the vector length
175:  Output Parameter:
176:  .  v - the vector
178:  Notes:
179:  Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
180:  same type as an existing vector.
182:  Level: intermediate
184:  .seealso: `VecCreateMPICUDA()`, `VecCreateMPI()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`
185:  @*/
186: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm, PetscInt n, Vec *v)
187: {
188:   VecCreate(comm, v);
189:   VecSetSizes(*v, n, n);
190:   VecSetType(*v, VECSEQCUDA);
191:   return 0;
192: }
194: PetscErrorCode VecDuplicate_SeqCUDA(Vec win, Vec *V)
195: {
196:   VecCreateSeqCUDA(PetscObjectComm((PetscObject)win), win->map->n, V);
197:   PetscLayoutReference(win->map, &(*V)->map);
198:   PetscObjectListDuplicate(((PetscObject)win)->olist, &((PetscObject)(*V))->olist);
199:   PetscFunctionListDuplicate(((PetscObject)win)->qlist, &((PetscObject)(*V))->qlist);
200:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
201:   return 0;
202: }
204: PetscErrorCode VecCreate_SeqCUDA(Vec V)
205: {
206:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
207:   PetscLayoutSetUp(V->map);
208:   VecCUDAAllocateCheck(V);
209:   VecCreate_SeqCUDA_Private(V, ((Vec_CUDA *)V->spptr)->GPUarray_allocated);
210:   VecSet_SeqCUDA(V, 0.0);
211:   return 0;
212: }
214: /*@C
215:    VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
216:    where the user provides the array space to store the vector values. The array
217:    provided must be a GPU array.
219:    Collective
221:    Input Parameters:
222: +  comm - the communicator, should be PETSC_COMM_SELF
223: .  bs - the block size
224: .  n - the vector length
225: -  array - GPU memory where the vector elements are to be stored.
227:    Output Parameter:
228: .  V - the vector
230:    Notes:
231:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
232:    same type as an existing vector.
234:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
235:    at a later stage to SET the array for storing the vector values.
237:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
238:    The user should not free the array until the vector is destroyed.
240:    Level: intermediate
242: .seealso: `VecCreateMPICUDAWithArray()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`,
243:           `VecCreateGhost()`, `VecCreateSeq()`, `VecCUDAPlaceArray()`, `VecCreateSeqWithArray()`,
244:           `VecCreateMPIWithArray()`
245: @*/
246: PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar array[], Vec *V)
247: {
248:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
249:   VecCreate(comm, V);
250:   VecSetSizes(*V, n, n);
251:   VecSetBlockSize(*V, bs);
252:   VecCreate_SeqCUDA_Private(*V, array);
253:   return 0;
254: }
256: /*@C
257:    VecCreateSeqCUDAWithArrays - Creates a CUDA sequential array-style vector,
258:    where the user provides the array space to store the vector values.
260:    Collective
262:    Input Parameters:
263: +  comm - the communicator, should be PETSC_COMM_SELF
264: .  bs - the block size
265: .  n - the vector length
266: -  cpuarray - CPU memory where the vector elements are to be stored.
267: -  gpuarray - GPU memory where the vector elements are to be stored.
269:    Output Parameter:
270: .  V - the vector
272:    Notes:
273:    If both cpuarray and gpuarray are provided, the caller must ensure that
274:    the provided arrays have identical values.
276:    PETSc does NOT free the provided arrays when the vector is destroyed via
277:    VecDestroy(). The user should not free the array until the vector is
278:    destroyed.
280:    Level: intermediate
282: .seealso: `VecCreateMPICUDAWithArrays()`, `VecCreate()`, `VecCreateSeqWithArray()`,
283:           `VecCUDAPlaceArray()`, `VecCreateSeqCUDAWithArray()`,
284:           `VecCUDAAllocateCheckHost()`
285: @*/
286: PetscErrorCode VecCreateSeqCUDAWithArrays(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar cpuarray[], const PetscScalar gpuarray[], Vec *V)
287: {
288:   // set V's gpuarray to be gpuarray, do not allocate memory on host yet.
289:   VecCreateSeqCUDAWithArray(comm, bs, n, gpuarray, V);
291:   if (cpuarray && gpuarray) {
292:     Vec_Seq *s        = (Vec_Seq *)((*V)->data);
293:     s->array          = (PetscScalar *)cpuarray;
294:     (*V)->offloadmask = PETSC_OFFLOAD_BOTH;
295:   } else if (cpuarray) {
296:     Vec_Seq *s        = (Vec_Seq *)((*V)->data);
297:     s->array          = (PetscScalar *)cpuarray;
298:     (*V)->offloadmask = PETSC_OFFLOAD_CPU;
299:   } else if (gpuarray) {
300:     (*V)->offloadmask = PETSC_OFFLOAD_GPU;
301:   } else {
302:     (*V)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
303:   }
305:   return 0;
306: }
308: PetscErrorCode VecGetArray_SeqCUDA(Vec v, PetscScalar **a)
309: {
310:   VecCUDACopyFromGPU(v);
311:   *a = *((PetscScalar **)v->data);
312:   return 0;
313: }
315: PetscErrorCode VecRestoreArray_SeqCUDA(Vec v, PetscScalar **a)
316: {
317:   v->offloadmask = PETSC_OFFLOAD_CPU;
318:   return 0;
319: }
321: PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v, PetscScalar **a)
322: {
323:   VecCUDAAllocateCheckHost(v);
324:   *a = *((PetscScalar **)v->data);
325:   return 0;
326: }
328: PetscErrorCode VecGetArrayAndMemType_SeqCUDA(Vec v, PetscScalar **a, PetscMemType *mtype)
329: {
330:   VecCUDACopyToGPU(v);
331:   *a = ((Vec_CUDA *)v->spptr)->GPUarray;
332:   if (mtype) *mtype = ((Vec_CUDA *)v->spptr)->nvshmem ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUDA;
333:   return 0;
334: }
336: PetscErrorCode VecRestoreArrayAndMemType_SeqCUDA(Vec v, PetscScalar **a)
337: {
338:   v->offloadmask = PETSC_OFFLOAD_GPU;
339:   return 0;
340: }
342: PetscErrorCode VecGetArrayWriteAndMemType_SeqCUDA(Vec v, PetscScalar **a, PetscMemType *mtype)
343: {
344:   /* Allocate memory (not zeroed) on device if not yet, but no need to sync data from host to device */
345:   VecCUDAAllocateCheck(v);
346:   *a = ((Vec_CUDA *)v->spptr)->GPUarray;
347:   if (mtype) *mtype = ((Vec_CUDA *)v->spptr)->nvshmem ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUDA;
348:   return 0;
349: }
351: PetscErrorCode VecBindToCPU_SeqCUDA(Vec V, PetscBool bind)
352: {
353:   V->boundtocpu = bind;
354:   if (bind) {
355:     VecCUDACopyFromGPU(V);
356:     V->offloadmask                  = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
357:     V->ops->dot                     = VecDot_Seq;
358:     V->ops->norm                    = VecNorm_Seq;
359:     V->ops->tdot                    = VecTDot_Seq;
360:     V->ops->scale                   = VecScale_Seq;
361:     V->ops->copy                    = VecCopy_Seq;
362:     V->ops->set                     = VecSet_Seq;
363:     V->ops->swap                    = VecSwap_Seq;
364:     V->ops->axpy                    = VecAXPY_Seq;
365:     V->ops->axpby                   = VecAXPBY_Seq;
366:     V->ops->axpbypcz                = VecAXPBYPCZ_Seq;
367:     V->ops->pointwisemult           = VecPointwiseMult_Seq;
368:     V->ops->pointwisedivide         = VecPointwiseDivide_Seq;
369:     V->ops->setrandom               = VecSetRandom_Seq;
370:     V->ops->dot_local               = VecDot_Seq;
371:     V->ops->tdot_local              = VecTDot_Seq;
372:     V->ops->norm_local              = VecNorm_Seq;
373:     V->ops->mdot_local              = VecMDot_Seq;
374:     V->ops->mtdot_local             = VecMTDot_Seq;
375:     V->ops->maxpy                   = VecMAXPY_Seq;
376:     V->ops->mdot                    = VecMDot_Seq;
377:     V->ops->mtdot                   = VecMTDot_Seq;
378:     V->ops->aypx                    = VecAYPX_Seq;
379:     V->ops->waxpy                   = VecWAXPY_Seq;
380:     V->ops->dotnorm2                = NULL;
381:     V->ops->placearray              = VecPlaceArray_Seq;
382:     V->ops->replacearray            = VecReplaceArray_SeqCUDA;
383:     V->ops->resetarray              = VecResetArray_Seq;
384:     V->ops->duplicate               = VecDuplicate_Seq;
385:     V->ops->conjugate               = VecConjugate_Seq;
386:     V->ops->getlocalvector          = NULL;
387:     V->ops->restorelocalvector      = NULL;
388:     V->ops->getlocalvectorread      = NULL;
389:     V->ops->restorelocalvectorread  = NULL;
390:     V->ops->getarraywrite           = NULL;
391:     V->ops->getarrayandmemtype      = NULL;
392:     V->ops->getarraywriteandmemtype = NULL;
393:     V->ops->restorearrayandmemtype  = NULL;
394:     V->ops->max                     = VecMax_Seq;
395:     V->ops->min                     = VecMin_Seq;
396:     V->ops->reciprocal              = VecReciprocal_Default;
397:     V->ops->sum                     = NULL;
398:     V->ops->shift                   = NULL;
399:     V->ops->setpreallocationcoo     = VecSetPreallocationCOO_Seq;
400:     V->ops->setvaluescoo            = VecSetValuesCOO_Seq;
401:     /* default random number generator */
402:     PetscFree(V->defaultrandtype);
403:     PetscStrallocpy(PETSCRANDER48, &V->defaultrandtype);
404:   } else {
405:     V->ops->dot                     = VecDot_SeqCUDA;
406:     V->ops->norm                    = VecNorm_SeqCUDA;
407:     V->ops->tdot                    = VecTDot_SeqCUDA;
408:     V->ops->scale                   = VecScale_SeqCUDA;
409:     V->ops->copy                    = VecCopy_SeqCUDA;
410:     V->ops->set                     = VecSet_SeqCUDA;
411:     V->ops->swap                    = VecSwap_SeqCUDA;
412:     V->ops->axpy                    = VecAXPY_SeqCUDA;
413:     V->ops->axpby                   = VecAXPBY_SeqCUDA;
414:     V->ops->axpbypcz                = VecAXPBYPCZ_SeqCUDA;
415:     V->ops->pointwisemult           = VecPointwiseMult_SeqCUDA;
416:     V->ops->pointwisedivide         = VecPointwiseDivide_SeqCUDA;
417:     V->ops->setrandom               = VecSetRandom_SeqCUDA;
418:     V->ops->dot_local               = VecDot_SeqCUDA;
419:     V->ops->tdot_local              = VecTDot_SeqCUDA;
420:     V->ops->norm_local              = VecNorm_SeqCUDA;
421:     V->ops->mdot_local              = VecMDot_SeqCUDA;
422:     V->ops->maxpy                   = VecMAXPY_SeqCUDA;
423:     V->ops->mdot                    = VecMDot_SeqCUDA;
424:     V->ops->aypx                    = VecAYPX_SeqCUDA;
425:     V->ops->waxpy                   = VecWAXPY_SeqCUDA;
426:     V->ops->dotnorm2                = VecDotNorm2_SeqCUDA;
427:     V->ops->placearray              = VecPlaceArray_SeqCUDA;
428:     V->ops->replacearray            = VecReplaceArray_SeqCUDA;
429:     V->ops->resetarray              = VecResetArray_SeqCUDA;
430:     V->ops->destroy                 = VecDestroy_SeqCUDA;
431:     V->ops->duplicate               = VecDuplicate_SeqCUDA;
432:     V->ops->conjugate               = VecConjugate_SeqCUDA;
433:     V->ops->getlocalvector          = VecGetLocalVector_SeqCUDA;
434:     V->ops->restorelocalvector      = VecRestoreLocalVector_SeqCUDA;
435:     V->ops->getlocalvectorread      = VecGetLocalVectorRead_SeqCUDA;
436:     V->ops->restorelocalvectorread  = VecRestoreLocalVectorRead_SeqCUDA;
437:     V->ops->getarraywrite           = VecGetArrayWrite_SeqCUDA;
438:     V->ops->getarray                = VecGetArray_SeqCUDA;
439:     V->ops->restorearray            = VecRestoreArray_SeqCUDA;
440:     V->ops->getarrayandmemtype      = VecGetArrayAndMemType_SeqCUDA;
441:     V->ops->getarraywriteandmemtype = VecGetArrayWriteAndMemType_SeqCUDA;
442:     V->ops->restorearrayandmemtype  = VecRestoreArrayAndMemType_SeqCUDA;
443:     V->ops->max                     = VecMax_SeqCUDA;
444:     V->ops->min                     = VecMin_SeqCUDA;
445:     V->ops->reciprocal              = VecReciprocal_SeqCUDA;
446:     V->ops->sum                     = VecSum_SeqCUDA;
447:     V->ops->shift                   = VecShift_SeqCUDA;
448:     V->ops->setpreallocationcoo     = VecSetPreallocationCOO_SeqCUDA;
449:     V->ops->setvaluescoo            = VecSetValuesCOO_SeqCUDA;
451:     /* default random number generator */
452:     PetscFree(V->defaultrandtype);
453:     PetscStrallocpy(PETSCCURAND, &V->defaultrandtype);
454:   }
455:   return 0;
456: }
458: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V, const PetscScalar *array)
459: {
460:   Vec_CUDA   *veccuda;
461:   PetscMPIInt size;
462:   PetscBool   option_set;
464:   MPI_Comm_size(PetscObjectComm((PetscObject)V), &size);
466:   VecCreate_Seq_Private(V, 0);
467:   PetscObjectChangeTypeName((PetscObject)V, VECSEQCUDA);
468:   VecBindToCPU_SeqCUDA(V, PETSC_FALSE);
469:   V->ops->bindtocpu = VecBindToCPU_SeqCUDA;
471:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
472:   if (array) {
473:     if (!V->spptr) {
474:       PetscReal pinned_memory_min;
475:       PetscCalloc(sizeof(Vec_CUDA), &V->spptr);
476:       veccuda        = (Vec_CUDA *)V->spptr;
477:       V->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
479:       pinned_memory_min = 0;
480:       /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
481:          Note: This same code duplicated in VecCUDAAllocateCheck() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
482:       PetscOptionsBegin(PetscObjectComm((PetscObject)V), ((PetscObject)V)->prefix, "VECCUDA Options", "Vec");
483:       PetscOptionsReal("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", pinned_memory_min, &pinned_memory_min, &option_set);
484:       if (option_set) V->minimum_bytes_pinned_memory = pinned_memory_min;
485:       PetscOptionsEnd();
486:     }
487:     veccuda           = (Vec_CUDA *)V->spptr;
488:     veccuda->GPUarray = (PetscScalar *)array;
489:     V->offloadmask    = PETSC_OFFLOAD_GPU;
490:   }
491:   return 0;
492: }