Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/core/src/cuda_stream.cpp
16337 views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
// Third party copyrights are property of their respective owners.
16
//
17
// Redistribution and use in source and binary forms, with or without modification,
18
// are permitted provided that the following conditions are met:
19
//
20
// * Redistribution's of source code must retain the above copyright notice,
21
// this list of conditions and the following disclaimer.
22
//
23
// * Redistribution's in binary form must reproduce the above copyright notice,
24
// this list of conditions and the following disclaimer in the documentation
25
// and/or other materials provided with the distribution.
26
//
27
// * The name of the copyright holders may not be used to endorse or promote products
28
// derived from this software without specific prior written permission.
29
//
30
// This software is provided by the copyright holders and contributors "as is" and
31
// any express or implied warranties, including, but not limited to, the implied
32
// warranties of merchantability and fitness for a particular purpose are disclaimed.
33
// In no event shall the Intel Corporation or contributors be liable for any direct,
34
// indirect, incidental, special, exemplary, or consequential damages
35
// (including, but not limited to, procurement of substitute goods or services;
36
// loss of use, data, or profits; or business interruption) however caused
37
// and on any theory of liability, whether in contract, strict liability,
38
// or tort (including negligence or otherwise) arising in any way out of
39
// the use of this software, even if advised of the possibility of such damage.
40
//
41
//M*/
42
43
#include "precomp.hpp"
44
45
using namespace cv;
46
using namespace cv::cuda;
47
48
#if defined(_MSC_VER)
49
#pragma warning(disable : 4702) // unreachable code
50
#endif
51
52
/////////////////////////////////////////////////////////////
/// MemoryStack

#ifdef HAVE_CUDA

namespace
{
class MemoryPool;

// Bump allocator over a fixed slice of device memory.
// Allocations must be returned in strict LIFO order; in debug builds
// the `allocations` vector verifies that discipline.
class MemoryStack
{
public:
    uchar* requestMemory(size_t size);
    void returnMemory(uchar* ptr);

    uchar* datastart;
    uchar* dataend;
    uchar* tip;

    bool isFree;
    MemoryPool* pool;

#if !defined(NDEBUG)
    std::vector<size_t> allocations;
#endif
};

// Reserve `size` bytes from the top of the stack.
// Returns 0 when the remaining capacity is insufficient.
uchar* MemoryStack::requestMemory(size_t size)
{
    const size_t available = dataend - tip;

    if (size > available)
        return 0;

    uchar* const allocated = tip;
    tip += size;

#if !defined(NDEBUG)
    allocations.push_back(size);
#endif

    return allocated;
}

// Release the most recent allocation; `ptr` must be the pointer
// returned by the matching requestMemory() call (LIFO discipline,
// asserted in debug builds).
void MemoryStack::returnMemory(uchar* ptr)
{
    CV_DbgAssert( ptr >= datastart && ptr < dataend );

#if !defined(NDEBUG)
    const size_t allocSize = tip - ptr;
    CV_Assert( allocSize == allocations.back() );
    allocations.pop_back();
#endif

    tip = ptr;
}
}

#endif
112
113
/////////////////////////////////////////////////////////////
/// MemoryPool

#ifdef HAVE_CUDA

namespace
{
// Owns one device allocation carved into `stackCount_` equal-sized
// MemoryStack slices that are lent out to StackAllocator instances.
// All public methods are serialized by `mtx_`.
class MemoryPool
{
public:
    MemoryPool();
    ~MemoryPool() { release(); }

    void initialize(size_t stackSize, int stackCount);
    void release();

    MemoryStack* getFreeMemStack();
    void returnMemStack(MemoryStack* memStack);

private:
    void initializeImpl();

    Mutex mtx_;

    bool initialized_;
    size_t stackSize_;
    int stackCount_;

    uchar* mem_;

    std::vector<MemoryStack> stacks_;

    MemoryPool(const MemoryPool&); //= delete;
};

MemoryPool::MemoryPool() : initialized_(false), mem_(0)
{
    // default : 10 Mb, 5 stacks
    stackSize_ = 10 * 1024 * 1024;
    stackCount_ = 5;
}

// Re-initialize the pool with a new geometry, releasing any previously
// held device memory first.
void MemoryPool::initialize(size_t stackSize, int stackCount)
{
    AutoLock lock(mtx_);

    release();

    stackSize_ = stackSize;
    stackCount_ = stackCount;

    initializeImpl();
}

// Allocate the backing device buffer and partition it into stacks.
// Allocation failure is not fatal: the pool simply stays uninitialized
// and getFreeMemStack() will return 0.
void MemoryPool::initializeImpl()
{
    // explicit cast avoids size_t * int overflow for very large configurations
    const size_t totalSize = stackSize_ * static_cast<size_t>(stackCount_);

    if (totalSize > 0)
    {
        cudaError_t err = cudaMalloc(&mem_, totalSize);
        if (err != cudaSuccess)
            return;

        stacks_.resize(stackCount_);

        uchar* ptr = mem_;

        for (int i = 0; i < stackCount_; ++i)
        {
            stacks_[i].datastart = ptr;
            stacks_[i].dataend = ptr + stackSize_;
            stacks_[i].tip = ptr;
            stacks_[i].isFree = true;
            stacks_[i].pool = this;

            ptr += stackSize_;
        }

        initialized_ = true;
    }
}

// Free the backing device buffer. In debug builds, verify that no stack
// is still lent out and that every stack is fully unwound.
void MemoryPool::release()
{
    if (mem_)
    {
#if !defined(NDEBUG)
        for (int i = 0; i < stackCount_; ++i)
        {
            CV_DbgAssert( stacks_[i].isFree );
            CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
        }
#endif

        cudaFree(mem_);

        mem_ = 0;
        initialized_ = false;
    }
}

// Hand out the first free stack, lazily initializing the pool on first
// use. Returns 0 if initialization failed or all stacks are in use.
MemoryStack* MemoryPool::getFreeMemStack()
{
    AutoLock lock(mtx_);

    if (!initialized_)
        initializeImpl();

    if (!mem_)
        return 0;

    for (int i = 0; i < stackCount_; ++i)
    {
        if (stacks_[i].isFree)
        {
            stacks_[i].isFree = false;
            return &stacks_[i];
        }
    }

    return 0;
}

// Return a stack previously obtained from getFreeMemStack().
// Debug builds check that the stack belongs to this pool and that all
// of its allocations have been returned.
void MemoryPool::returnMemStack(MemoryStack* memStack)
{
    AutoLock lock(mtx_);

    CV_DbgAssert( !memStack->isFree );

#if !defined(NDEBUG)
    bool found = false;
    for (int i = 0; i < stackCount_; ++i)
    {
        if (memStack == &stacks_[i])
        {
            found = true;
            break;
        }
    }
    CV_DbgAssert( found );
#endif

    CV_DbgAssert( memStack->tip == memStack->datastart );

    memStack->isFree = true;
}
}

#endif
263
264
////////////////////////////////////////////////////////////////
265
/// Stream::Impl
266
267
#ifndef HAVE_CUDA
268
269
class cv::cuda::Stream::Impl
270
{
271
public:
272
Impl(void* ptr = 0)
273
{
274
CV_UNUSED(ptr);
275
throw_no_cuda();
276
}
277
};
278
279
#else
280
281
namespace
282
{
283
class StackAllocator;
284
}
285
286
class cv::cuda::Stream::Impl
287
{
288
public:
289
cudaStream_t stream;
290
bool ownStream;
291
292
Ptr<GpuMat::Allocator> allocator;
293
294
Impl();
295
Impl(const Ptr<GpuMat::Allocator>& allocator);
296
explicit Impl(cudaStream_t stream);
297
298
~Impl();
299
};
300
301
cv::cuda::Stream::Impl::Impl() : stream(0), ownStream(false)
302
{
303
cudaSafeCall( cudaStreamCreate(&stream) );
304
ownStream = true;
305
306
allocator = makePtr<StackAllocator>(stream);
307
}
308
309
cv::cuda::Stream::Impl::Impl(const Ptr<GpuMat::Allocator>& allocator) : stream(0), ownStream(false), allocator(allocator)
310
{
311
cudaSafeCall( cudaStreamCreate(&stream) );
312
ownStream = true;
313
}
314
315
cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_), ownStream(false)
316
{
317
allocator = makePtr<StackAllocator>(stream);
318
}
319
320
cv::cuda::Stream::Impl::~Impl()
321
{
322
allocator.release();
323
324
if (stream && ownStream)
325
{
326
cudaStreamDestroy(stream);
327
}
328
}
329
330
#endif
331
332
/////////////////////////////////////////////////////////////
/// DefaultDeviceInitializer

#ifdef HAVE_CUDA

namespace cv { namespace cuda
{
// Lazily creates and owns the per-device null-stream wrappers and the
// per-device BufferPool memory pools. A single global instance
// (`initializer`, below) lives for the duration of the process.
class DefaultDeviceInitializer
{
public:
    DefaultDeviceInitializer();
    ~DefaultDeviceInitializer();

    Stream& getNullStream(int deviceId);
    MemoryPool& getMemoryPool(int deviceId);

private:
    std::vector<Ptr<Stream> > streams_;
    Mutex streams_mtx_;

    std::vector<Ptr<MemoryPool> > pools_;
    Mutex pools_mtx_;
};

DefaultDeviceInitializer::DefaultDeviceInitializer()
{
}

DefaultDeviceInitializer::~DefaultDeviceInitializer()
{
    streams_.clear();

    // NOTE(review): this destructor runs at static-destruction time;
    // calling the CUDA runtime here assumes it has not been torn down yet.
    for (size_t i = 0; i < pools_.size(); ++i)
    {
        cudaSetDevice(static_cast<int>(i));
        pools_[i]->release();
    }

    pools_.clear();
}

// Return the per-device wrapper around the CUDA default (null) stream,
// creating the whole table lazily on first use.
Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
{
    AutoLock lock(streams_mtx_);

    if (streams_.empty())
    {
        int deviceCount = getCudaEnabledDeviceCount();

        if (deviceCount > 0)
            streams_.resize(deviceCount);
    }

    CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );

    if (streams_[deviceId].empty())
    {
        // a null cudaStream_t makes Stream::Impl wrap the default stream
        cudaStream_t stream = NULL;
        Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
        streams_[deviceId] = Ptr<Stream>(new Stream(impl));
    }

    return *streams_[deviceId];
}

// Return the BufferPool memory pool for `deviceId`, creating pools for
// every device lazily on first use.
MemoryPool& DefaultDeviceInitializer::getMemoryPool(int deviceId)
{
    AutoLock lock(pools_mtx_);

    if (pools_.empty())
    {
        int deviceCount = getCudaEnabledDeviceCount();

        if (deviceCount > 0)
        {
            pools_.resize(deviceCount);
            for (size_t i = 0; i < pools_.size(); ++i)
            {
                cudaSetDevice(static_cast<int>(i));
                pools_[i] = makePtr<MemoryPool>();
            }
        }
    }

    CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );

    MemoryPool* p = pools_[deviceId];
    CV_Assert(p);
    return *p;
}

DefaultDeviceInitializer initializer;
}}

#endif
430
431
/////////////////////////////////////////////////////////////
432
/// Stream
433
434
cv::cuda::Stream::Stream()
435
{
436
#ifndef HAVE_CUDA
437
throw_no_cuda();
438
#else
439
impl_ = makePtr<Impl>();
440
#endif
441
}
442
443
// Create a new asynchronous stream that uses a caller-supplied
// allocator for BufferPool temporaries.
cv::cuda::Stream::Stream(const Ptr<GpuMat::Allocator>& allocator)
{
#ifndef HAVE_CUDA
    CV_UNUSED(allocator);
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(allocator);
#endif
}
452
453
bool cv::cuda::Stream::queryIfComplete() const
454
{
455
#ifndef HAVE_CUDA
456
throw_no_cuda();
457
#else
458
cudaError_t err = cudaStreamQuery(impl_->stream);
459
460
if (err == cudaErrorNotReady || err == cudaSuccess)
461
return err == cudaSuccess;
462
463
cudaSafeCall(err);
464
return false;
465
#endif
466
}
467
468
void cv::cuda::Stream::waitForCompletion()
469
{
470
#ifndef HAVE_CUDA
471
throw_no_cuda();
472
#else
473
cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
474
#endif
475
}
476
477
void cv::cuda::Stream::waitEvent(const Event& event)
478
{
479
#ifndef HAVE_CUDA
480
CV_UNUSED(event);
481
throw_no_cuda();
482
#else
483
cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) );
484
#endif
485
}
486
487
#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)

namespace
{
// Heap-allocated bundle passed through cudaStreamAddCallback's
// user-data pointer; deleted by the trampoline after invocation.
struct CallbackData
{
    Stream::StreamCallback callback;
    void* userData;

    CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
};

// Trampoline with the signature CUDA expects; forwards to the user
// callback and then frees the bundle (fire-once semantics).
void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
{
    CallbackData* data = reinterpret_cast<CallbackData*>(userData);
    data->callback(static_cast<int>(status), data->userData);
    delete data;
}
}

#endif
508
509
// Schedule a host callback to run once all previously queued work on
// this stream completes. Requires CUDA >= 5.0.
void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
{
#ifndef HAVE_CUDA
    CV_UNUSED(callback);
    CV_UNUSED(userData);
    throw_no_cuda();
#else
#if CUDART_VERSION < 5000
    CV_UNUSED(callback);
    CV_UNUSED(userData);
    CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
#else
    // ownership of `data` passes to the trampoline, which deletes it
    CallbackData* data = new CallbackData(callback, userData);

    cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
#endif
#endif
}
527
528
// Return the per-device singleton wrapping the CUDA default stream.
Stream& cv::cuda::Stream::Null()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    const int deviceId = getDevice();
    return initializer.getNullStream(deviceId);
#endif
}
537
538
// Safe-bool conversion: true for a real (non-default) stream, false
// when the wrapped cudaStream_t is the null/default stream.
cv::cuda::Stream::operator bool_type() const
{
#ifndef HAVE_CUDA
    return 0;
#else
    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
#endif
}
546
547
#ifdef HAVE_CUDA

// Expose the underlying cudaStream_t for interop with raw CUDA code.
cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
    return stream.impl_->stream;
}

// Wrap an externally created cudaStream_t; the returned Stream does
// not take ownership of it.
Stream cv::cuda::StreamAccessor::wrapStream(cudaStream_t stream)
{
    return Stream(makePtr<Stream::Impl>(stream));
}

#endif
560
561
/////////////////////////////////////////////////////////////
/// StackAllocator

#ifdef HAVE_CUDA

namespace
{
bool enableMemoryPool = false;

// GpuMat allocator that carves stream-local temporary buffers out of a
// MemoryStack borrowed from the per-device MemoryPool. Buffers must be
// freed in LIFO order (enforced by MemoryStack in debug builds).
class StackAllocator : public GpuMat::Allocator
{
public:
    explicit StackAllocator(cudaStream_t stream);
    ~StackAllocator();

    bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE;
    void free(GpuMat* mat) CV_OVERRIDE;

private:
    StackAllocator(const StackAllocator&);
    StackAllocator& operator =(const StackAllocator&);

    cudaStream_t stream_;
    MemoryStack* memStack_;
    size_t alignment_;
};

// Fix: alignment_ was previously left uninitialized when the memory
// pool is disabled; initialize it to a harmless identity value so it
// can never be read as garbage.
StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0), alignment_(1)
{
    if (enableMemoryPool)
    {
        const int deviceId = getDevice();
        memStack_ = initializer.getMemoryPool(deviceId).getFreeMemStack();
        DeviceInfo devInfo(deviceId);
        alignment_ = devInfo.textureAlignment();
    }
}

StackAllocator::~StackAllocator()
{
    if (memStack_ != 0)
    {
        // ensure all work using the stack's memory has finished before
        // the stack is handed back to the pool
        cudaStreamSynchronize(stream_);

        memStack_->pool->returnMemStack(memStack_);
    }
}
607
608
// Round `what` up to the next multiple of `alignment`.
// `alignment` must be a power of two for the mask trick to be valid.
size_t alignUp(size_t what, size_t alignment)
{
    const size_t mask = alignment - 1;
    return (what + mask) & ~mask;
}
615
616
// Carve a device buffer for `mat` out of the borrowed memory stack.
// Returns false (letting GpuMat fall back to the default allocator)
// when no stack is held or the stack has insufficient space.
bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
{
    if (memStack_ == 0)
        return false;

    const size_t rowBytes = elemSize * cols;

    size_t pitch, memSize;
    if (rows > 1 && cols > 1)
    {
        // pad each row so every row starts at an aligned address
        pitch = alignUp(rowBytes, alignment_);
        memSize = pitch * rows;
    }
    else
    {
        // Single row or single column must be continuous
        pitch = rowBytes;
        memSize = alignUp(rowBytes * rows, 64);
    }

    uchar* const memory = memStack_->requestMemory(memSize);
    if (memory == 0)
        return false;

    mat->data = memory;
    mat->step = pitch;
    mat->refcount = (int*) fastMalloc(sizeof(int));

    return true;
}
646
647
// Pop the matrix's buffer off the memory stack (LIFO) and release the
// host-side reference counter allocated in allocate().
void StackAllocator::free(GpuMat* mat)
{
    if (memStack_ == 0)
        return;

    memStack_->returnMemory(mat->datastart);
    fastFree(mat->refcount);
}
}

#endif
658
659
/////////////////////////////////////////////////////////////
660
/// BufferPool
661
662
void cv::cuda::setBufferPoolUsage(bool on)
663
{
664
#ifndef HAVE_CUDA
665
CV_UNUSED(on);
666
throw_no_cuda();
667
#else
668
enableMemoryPool = on;
669
#endif
670
}
671
672
void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
673
{
674
#ifndef HAVE_CUDA
675
CV_UNUSED(deviceId);
676
CV_UNUSED(stackSize);
677
CV_UNUSED(stackCount);
678
throw_no_cuda();
679
#else
680
const int currentDevice = getDevice();
681
682
if (deviceId >= 0)
683
{
684
setDevice(deviceId);
685
initializer.getMemoryPool(deviceId).initialize(stackSize, stackCount);
686
}
687
else
688
{
689
const int deviceCount = getCudaEnabledDeviceCount();
690
691
for (deviceId = 0; deviceId < deviceCount; ++deviceId)
692
{
693
setDevice(deviceId);
694
initializer.getMemoryPool(deviceId).initialize(stackSize, stackCount);
695
}
696
}
697
698
setDevice(currentDevice);
699
#endif
700
}
701
702
#ifndef HAVE_CUDA
703
cv::cuda::BufferPool::BufferPool(Stream& stream)
704
{
705
CV_UNUSED(stream);
706
throw_no_cuda();
707
}
708
#else
709
cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->allocator)
710
{
711
}
712
#endif
713
714
// Allocate a rows x cols matrix of the given type through the pool's
// allocator.
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
#ifndef HAVE_CUDA
    CV_UNUSED(rows);
    CV_UNUSED(cols);
    CV_UNUSED(type);
    throw_no_cuda();
#else
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
#endif
}
727
728
729
////////////////////////////////////////////////////////////////
730
// Event
731
732
#ifndef HAVE_CUDA
733
734
class cv::cuda::Event::Impl
735
{
736
public:
737
Impl(unsigned int)
738
{
739
throw_no_cuda();
740
}
741
};
742
743
#else
744
745
class cv::cuda::Event::Impl
746
{
747
public:
748
cudaEvent_t event;
749
bool ownEvent;
750
751
explicit Impl(unsigned int flags);
752
explicit Impl(cudaEvent_t event);
753
~Impl();
754
};
755
756
cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0), ownEvent(false)
757
{
758
cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
759
ownEvent = true;
760
}
761
762
cv::cuda::Event::Impl::Impl(cudaEvent_t e) : event(e), ownEvent(false)
763
{
764
}
765
766
cv::cuda::Event::Impl::~Impl()
767
{
768
if (event && ownEvent)
769
{
770
cudaEventDestroy(event);
771
}
772
}
773
774
cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event)
775
{
776
return event.impl_->event;
777
}
778
779
Event cv::cuda::EventAccessor::wrapEvent(cudaEvent_t event)
780
{
781
return Event(makePtr<Event::Impl>(event));
782
}
783
784
#endif
785
786
// Create a new CUDA event with the requested creation flags.
cv::cuda::Event::Event(CreateFlags flags)
{
#ifndef HAVE_CUDA
    CV_UNUSED(flags);
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(flags);
#endif
}
795
796
// Record this event into `stream`: the event fires once all work
// queued on the stream before this call has completed.
void cv::cuda::Event::record(Stream& stream)
{
#ifndef HAVE_CUDA
    CV_UNUSED(stream);
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) );
#endif
}
805
806
bool cv::cuda::Event::queryIfComplete() const
807
{
808
#ifndef HAVE_CUDA
809
throw_no_cuda();
810
#else
811
cudaError_t err = cudaEventQuery(impl_->event);
812
813
if (err == cudaErrorNotReady || err == cudaSuccess)
814
return err == cudaSuccess;
815
816
cudaSafeCall(err);
817
return false;
818
#endif
819
}
820
821
void cv::cuda::Event::waitForCompletion()
822
{
823
#ifndef HAVE_CUDA
824
throw_no_cuda();
825
#else
826
cudaSafeCall( cudaEventSynchronize(impl_->event) );
827
#endif
828
}
829
830
float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
831
{
832
#ifndef HAVE_CUDA
833
CV_UNUSED(start);
834
CV_UNUSED(end);
835
throw_no_cuda();
836
#else
837
float ms;
838
cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );
839
return ms;
840
#endif
841
}
842
843