Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/core/src/cuda_stream.cpp
16337 views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
// Third party copyrights are property of their respective owners.
16
//
17
// Redistribution and use in source and binary forms, with or without modification,
18
// are permitted provided that the following conditions are met:
19
//
20
// * Redistribution's of source code must retain the above copyright notice,
21
// this list of conditions and the following disclaimer.
22
//
23
// * Redistribution's in binary form must reproduce the above copyright notice,
24
// this list of conditions and the following disclaimer in the documentation
25
// and/or other materials provided with the distribution.
26
//
27
// * The name of the copyright holders may not be used to endorse or promote products
28
// derived from this software without specific prior written permission.
29
//
30
// This software is provided by the copyright holders and contributors "as is" and
31
// any express or implied warranties, including, but not limited to, the implied
32
// warranties of merchantability and fitness for a particular purpose are disclaimed.
33
// In no event shall the Intel Corporation or contributors be liable for any direct,
34
// indirect, incidental, special, exemplary, or consequential damages
35
// (including, but not limited to, procurement of substitute goods or services;
36
// loss of use, data, or profits; or business interruption) however caused
37
// and on any theory of liability, whether in contract, strict liability,
38
// or tort (including negligence or otherwise) arising in any way out of
39
// the use of this software, even if advised of the possibility of such damage.
40
//
41
//M*/
42
43
#include "precomp.hpp"
44
45
using namespace cv;
46
using namespace cv::cuda;
47
48
#if defined(_MSC_VER)
49
#pragma warning(disable : 4702) // unreachable code
50
#endif
51
52
/////////////////////////////////////////////////////////////
/// MemoryStack

#ifdef HAVE_CUDA

namespace
{
class MemoryPool;

// Bump allocator over a fixed slice of device memory.
// Allocations must be returned in strict LIFO order; in debug builds
// the `allocations` vector verifies that discipline.
class MemoryStack
{
public:
    uchar* requestMemory(size_t size);
    void returnMemory(uchar* ptr);

    uchar* datastart;
    uchar* dataend;
    uchar* tip;

    bool isFree;
    MemoryPool* pool;

#if !defined(NDEBUG)
    std::vector<size_t> allocations;
#endif
};

// Reserve `size` bytes from the top of the stack.
// Returns 0 when the remaining capacity is insufficient.
uchar* MemoryStack::requestMemory(size_t size)
{
    const size_t available = dataend - tip;

    if (size > available)
        return 0;

    uchar* const allocated = tip;
    tip += size;

#if !defined(NDEBUG)
    allocations.push_back(size);
#endif

    return allocated;
}

// Release the most recent allocation; `ptr` must be the pointer
// returned by the matching requestMemory() call (LIFO discipline,
// asserted in debug builds).
void MemoryStack::returnMemory(uchar* ptr)
{
    CV_DbgAssert( ptr >= datastart && ptr < dataend );

#if !defined(NDEBUG)
    const size_t allocSize = tip - ptr;
    CV_Assert( allocSize == allocations.back() );
    allocations.pop_back();
#endif

    tip = ptr;
}
}

#endif
112
113
/////////////////////////////////////////////////////////////
/// MemoryPool

#ifdef HAVE_CUDA

namespace
{
// Owns one device allocation carved into `stackCount_` equal-sized
// MemoryStack slices that are lent out to StackAllocator instances.
// All public methods are serialized by `mtx_`.
class MemoryPool
{
public:
    MemoryPool();
    ~MemoryPool() { release(); }

    void initialize(size_t stackSize, int stackCount);
    void release();

    MemoryStack* getFreeMemStack();
    void returnMemStack(MemoryStack* memStack);

private:
    void initializeImpl();

    Mutex mtx_;

    bool initialized_;
    size_t stackSize_;
    int stackCount_;

    uchar* mem_;

    std::vector<MemoryStack> stacks_;

    MemoryPool(const MemoryPool&); //= delete;
};

MemoryPool::MemoryPool() : initialized_(false), mem_(0)
{
    // default : 10 Mb, 5 stacks
    stackSize_ = 10 * 1024 * 1024;
    stackCount_ = 5;
}

// Re-initialize the pool with a new geometry, releasing any previously
// held device memory first.
void MemoryPool::initialize(size_t stackSize, int stackCount)
{
    AutoLock lock(mtx_);

    release();

    stackSize_ = stackSize;
    stackCount_ = stackCount;

    initializeImpl();
}

// Allocate the backing device buffer and partition it into stacks.
// Allocation failure is not fatal: the pool simply stays uninitialized
// and getFreeMemStack() will return 0.
void MemoryPool::initializeImpl()
{
    // explicit cast avoids size_t * int overflow for very large configurations
    const size_t totalSize = stackSize_ * static_cast<size_t>(stackCount_);

    if (totalSize > 0)
    {
        cudaError_t err = cudaMalloc(&mem_, totalSize);
        if (err != cudaSuccess)
            return;

        stacks_.resize(stackCount_);

        uchar* ptr = mem_;

        for (int i = 0; i < stackCount_; ++i)
        {
            stacks_[i].datastart = ptr;
            stacks_[i].dataend = ptr + stackSize_;
            stacks_[i].tip = ptr;
            stacks_[i].isFree = true;
            stacks_[i].pool = this;

            ptr += stackSize_;
        }

        initialized_ = true;
    }
}

// Free the backing device buffer. In debug builds, verify that no stack
// is still lent out and that every stack is fully unwound.
void MemoryPool::release()
{
    if (mem_)
    {
#if !defined(NDEBUG)
        for (int i = 0; i < stackCount_; ++i)
        {
            CV_DbgAssert( stacks_[i].isFree );
            CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
        }
#endif

        cudaFree(mem_);

        mem_ = 0;
        initialized_ = false;
    }
}

// Hand out the first free stack, lazily initializing the pool on first
// use. Returns 0 if initialization failed or all stacks are in use.
MemoryStack* MemoryPool::getFreeMemStack()
{
    AutoLock lock(mtx_);

    if (!initialized_)
        initializeImpl();

    if (!mem_)
        return 0;

    for (int i = 0; i < stackCount_; ++i)
    {
        if (stacks_[i].isFree)
        {
            stacks_[i].isFree = false;
            return &stacks_[i];
        }
    }

    return 0;
}

// Return a stack previously obtained from getFreeMemStack().
// Debug builds check that the stack belongs to this pool and that all
// of its allocations have been returned.
void MemoryPool::returnMemStack(MemoryStack* memStack)
{
    AutoLock lock(mtx_);

    CV_DbgAssert( !memStack->isFree );

#if !defined(NDEBUG)
    bool found = false;
    for (int i = 0; i < stackCount_; ++i)
    {
        if (memStack == &stacks_[i])
        {
            found = true;
            break;
        }
    }
    CV_DbgAssert( found );
#endif

    CV_DbgAssert( memStack->tip == memStack->datastart );

    memStack->isFree = true;
}
}

#endif
263
264
////////////////////////////////////////////////////////////////
265
/// Stream::Impl
266
267
#ifndef HAVE_CUDA
268
269
class cv::cuda::Stream::Impl
270
{
271
public:
272
Impl(void* ptr = 0)
273
{
274
CV_UNUSED(ptr);
275
throw_no_cuda();
276
}
277
};
278
279
#else
280
281
namespace
282
{
283
class StackAllocator;
284
}
285
286
class cv::cuda::Stream::Impl
287
{
288
public:
289
cudaStream_t stream;
290
bool ownStream;
291
292
Ptr<GpuMat::Allocator> allocator;
293
294
Impl();
295
Impl(const Ptr<GpuMat::Allocator>& allocator);
296
explicit Impl(cudaStream_t stream);
297
298
~Impl();
299
};
300
301
cv::cuda::Stream::Impl::Impl() : stream(0), ownStream(false)
302
{
303
cudaSafeCall( cudaStreamCreate(&stream) );
304
ownStream = true;
305
306
allocator = makePtr<StackAllocator>(stream);
307
}
308
309
cv::cuda::Stream::Impl::Impl(const Ptr<GpuMat::Allocator>& allocator) : stream(0), ownStream(false), allocator(allocator)
310
{
311
cudaSafeCall( cudaStreamCreate(&stream) );
312
ownStream = true;
313
}
314
315
cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_), ownStream(false)
316
{
317
allocator = makePtr<StackAllocator>(stream);
318
}
319
320
cv::cuda::Stream::Impl::~Impl()
321
{
322
allocator.release();
323
324
if (stream && ownStream)
325
{
326
cudaStreamDestroy(stream);
327
}
328
}
329
330
#endif
331
332
/////////////////////////////////////////////////////////////
/// DefaultDeviceInitializer

#ifdef HAVE_CUDA

namespace cv { namespace cuda
{
// Lazily creates and owns the per-device null-stream wrappers and the
// per-device BufferPool memory pools. A single global instance
// (`initializer`, below) lives for the duration of the process.
class DefaultDeviceInitializer
{
public:
    DefaultDeviceInitializer();
    ~DefaultDeviceInitializer();

    Stream& getNullStream(int deviceId);
    MemoryPool& getMemoryPool(int deviceId);

private:
    std::vector<Ptr<Stream> > streams_;
    Mutex streams_mtx_;

    std::vector<Ptr<MemoryPool> > pools_;
    Mutex pools_mtx_;
};

DefaultDeviceInitializer::DefaultDeviceInitializer()
{
}

DefaultDeviceInitializer::~DefaultDeviceInitializer()
{
    streams_.clear();

    // NOTE(review): this destructor runs at static-destruction time;
    // calling the CUDA runtime here assumes it has not been torn down yet.
    for (size_t i = 0; i < pools_.size(); ++i)
    {
        cudaSetDevice(static_cast<int>(i));
        pools_[i]->release();
    }

    pools_.clear();
}

// Return the per-device wrapper around the CUDA default (null) stream,
// creating the whole table lazily on first use.
Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
{
    AutoLock lock(streams_mtx_);

    if (streams_.empty())
    {
        int deviceCount = getCudaEnabledDeviceCount();

        if (deviceCount > 0)
            streams_.resize(deviceCount);
    }

    CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );

    if (streams_[deviceId].empty())
    {
        // a null cudaStream_t makes Stream::Impl wrap the default stream
        cudaStream_t stream = NULL;
        Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
        streams_[deviceId] = Ptr<Stream>(new Stream(impl));
    }

    return *streams_[deviceId];
}

// Return the BufferPool memory pool for `deviceId`, creating pools for
// every device lazily on first use.
MemoryPool& DefaultDeviceInitializer::getMemoryPool(int deviceId)
{
    AutoLock lock(pools_mtx_);

    if (pools_.empty())
    {
        int deviceCount = getCudaEnabledDeviceCount();

        if (deviceCount > 0)
        {
            pools_.resize(deviceCount);
            for (size_t i = 0; i < pools_.size(); ++i)
            {
                cudaSetDevice(static_cast<int>(i));
                pools_[i] = makePtr<MemoryPool>();
            }
        }
    }

    CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );

    MemoryPool* p = pools_[deviceId];
    CV_Assert(p);
    return *p;
}

DefaultDeviceInitializer initializer;
}}

#endif
430
431
/////////////////////////////////////////////////////////////
432
/// Stream
433
434
cv::cuda::Stream::Stream()
435
{
436
#ifndef HAVE_CUDA
437
throw_no_cuda();
438
#else
439
impl_ = makePtr<Impl>();
440
#endif
441
}
442
443
// Create a new asynchronous stream that uses a caller-supplied
// allocator for BufferPool temporaries.
cv::cuda::Stream::Stream(const Ptr<GpuMat::Allocator>& allocator)
{
#ifndef HAVE_CUDA
    CV_UNUSED(allocator);
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(allocator);
#endif
}
452
453
bool cv::cuda::Stream::queryIfComplete() const
454
{
455
#ifndef HAVE_CUDA
456
throw_no_cuda();
457
#else
458
cudaError_t err = cudaStreamQuery(impl_->stream);
459
460
if (err == cudaErrorNotReady || err == cudaSuccess)
461
return err == cudaSuccess;
462
463
cudaSafeCall(err);
464
return false;
465
#endif
466
}
467
468
void cv::cuda::Stream::waitForCompletion()
469
{
470
#ifndef HAVE_CUDA
471
throw_no_cuda();
472
#else
473
cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
474
#endif
475
}
476
477
void cv::cuda::Stream::waitEvent(const Event& event)
478
{
479
#ifndef HAVE_CUDA
480
CV_UNUSED(event);
481
throw_no_cuda();
482
#else
483
cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) );
484
#endif
485
}
486
487
#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)

namespace
{
// Heap-allocated bundle passed through cudaStreamAddCallback's
// user-data pointer; deleted by the trampoline after invocation.
struct CallbackData
{
    Stream::StreamCallback callback;
    void* userData;

    CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
};

// Trampoline with the signature CUDA expects; forwards to the user
// callback and then frees the bundle (fire-once semantics).
void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
{
    CallbackData* data = reinterpret_cast<CallbackData*>(userData);
    data->callback(static_cast<int>(status), data->userData);
    delete data;
}
}

#endif
508
509
// Schedule a host callback to run once all previously queued work on
// this stream completes. Requires CUDA >= 5.0.
void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
{
#ifndef HAVE_CUDA
    CV_UNUSED(callback);
    CV_UNUSED(userData);
    throw_no_cuda();
#else
#if CUDART_VERSION < 5000
    CV_UNUSED(callback);
    CV_UNUSED(userData);
    CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
#else
    // ownership of `data` passes to the trampoline, which deletes it
    CallbackData* data = new CallbackData(callback, userData);

    cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
#endif
#endif
}
527
528
// Return the per-device singleton wrapping the CUDA default stream.
Stream& cv::cuda::Stream::Null()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    const int deviceId = getDevice();
    return initializer.getNullStream(deviceId);
#endif
}
537
538
// Safe-bool conversion: true for a real (non-default) stream, false
// when the wrapped cudaStream_t is the null/default stream.
cv::cuda::Stream::operator bool_type() const
{
#ifndef HAVE_CUDA
    return 0;
#else
    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
#endif
}
546
547
#ifdef HAVE_CUDA

// Expose the underlying cudaStream_t for interop with raw CUDA code.
cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
    return stream.impl_->stream;
}

// Wrap an externally created cudaStream_t; the returned Stream does
// not take ownership of it.
Stream cv::cuda::StreamAccessor::wrapStream(cudaStream_t stream)
{
    return Stream(makePtr<Stream::Impl>(stream));
}

#endif
560
561
/////////////////////////////////////////////////////////////
/// StackAllocator

#ifdef HAVE_CUDA

namespace
{
bool enableMemoryPool = false;

// GpuMat allocator that carves stream-local temporary buffers out of a
// MemoryStack borrowed from the per-device MemoryPool. Buffers must be
// freed in LIFO order (enforced by MemoryStack in debug builds).
class StackAllocator : public GpuMat::Allocator
{
public:
    explicit StackAllocator(cudaStream_t stream);
    ~StackAllocator();

    bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE;
    void free(GpuMat* mat) CV_OVERRIDE;

private:
    StackAllocator(const StackAllocator&);
    StackAllocator& operator =(const StackAllocator&);

    cudaStream_t stream_;
    MemoryStack* memStack_;
    size_t alignment_;
};

// Fix: alignment_ was previously left uninitialized when the memory
// pool is disabled; initialize it to a harmless identity value so it
// can never be read as garbage.
StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0), alignment_(1)
{
    if (enableMemoryPool)
    {
        const int deviceId = getDevice();
        memStack_ = initializer.getMemoryPool(deviceId).getFreeMemStack();
        DeviceInfo devInfo(deviceId);
        alignment_ = devInfo.textureAlignment();
    }
}

StackAllocator::~StackAllocator()
{
    if (memStack_ != 0)
    {
        // ensure all work using the stack's memory has finished before
        // the stack is handed back to the pool
        cudaStreamSynchronize(stream_);

        memStack_->pool->returnMemStack(memStack_);
    }
}
607
608
// Round `what` up to the next multiple of `alignment`.
// `alignment` must be a power of two for the mask trick to be valid.
size_t alignUp(size_t what, size_t alignment)
{
    const size_t mask = alignment - 1;
    return (what + mask) & ~mask;
}
615
616
// Carve a device buffer for `mat` out of the borrowed memory stack.
// Returns false (letting GpuMat fall back to the default allocator)
// when no stack is held or the stack has insufficient space.
bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
{
    if (memStack_ == 0)
        return false;

    const size_t rowBytes = elemSize * cols;

    size_t pitch, memSize;
    if (rows > 1 && cols > 1)
    {
        // pad each row so every row starts at an aligned address
        pitch = alignUp(rowBytes, alignment_);
        memSize = pitch * rows;
    }
    else
    {
        // Single row or single column must be continuous
        pitch = rowBytes;
        memSize = alignUp(rowBytes * rows, 64);
    }

    uchar* const memory = memStack_->requestMemory(memSize);
    if (memory == 0)
        return false;

    mat->data = memory;
    mat->step = pitch;
    mat->refcount = (int*) fastMalloc(sizeof(int));

    return true;
}
646
647
// Pop the matrix's buffer off the memory stack (LIFO) and release the
// host-side reference counter allocated in allocate().
void StackAllocator::free(GpuMat* mat)
{
    if (memStack_ == 0)
        return;

    memStack_->returnMemory(mat->datastart);
    fastFree(mat->refcount);
}
}

#endif
658
659
/////////////////////////////////////////////////////////////
660
/// BufferPool
661
662
void cv::cuda::setBufferPoolUsage(bool on)
663
{
664
#ifndef HAVE_CUDA
665
CV_UNUSED(on);
666
throw_no_cuda();
667
#else
668
enableMemoryPool = on;
669
#endif
670
}
671
672
void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
673
{
674
#ifndef HAVE_CUDA
675
CV_UNUSED(deviceId);
676
CV_UNUSED(stackSize);
677
CV_UNUSED(stackCount);
678
throw_no_cuda();
679
#else
680
const int currentDevice = getDevice();
681
682
if (deviceId >= 0)
683
{
684
setDevice(deviceId);
685
initializer.getMemoryPool(deviceId).initialize(stackSize, stackCount);
686
}
687
else
688
{
689
const int deviceCount = getCudaEnabledDeviceCount();
690
691
for (deviceId = 0; deviceId < deviceCount; ++deviceId)
692
{
693
setDevice(deviceId);
694
initializer.getMemoryPool(deviceId).initialize(stackSize, stackCount);
695
}
696
}
697
698
setDevice(currentDevice);
699
#endif
700
}
701
702
#ifndef HAVE_CUDA
703
cv::cuda::BufferPool::BufferPool(Stream& stream)
704
{
705
CV_UNUSED(stream);
706
throw_no_cuda();
707
}
708
#else
709
cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->allocator)
710
{
711
}
712
#endif
713
714
// Allocate a rows x cols matrix of the given type through the pool's
// allocator.
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
#ifndef HAVE_CUDA
    CV_UNUSED(rows);
    CV_UNUSED(cols);
    CV_UNUSED(type);
    throw_no_cuda();
#else
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
#endif
}
727
728
729
////////////////////////////////////////////////////////////////
730
// Event
731
732
#ifndef HAVE_CUDA
733
734
class cv::cuda::Event::Impl
735
{
736
public:
737
Impl(unsigned int)
738
{
739
throw_no_cuda();
740
}
741
};
742
743
#else
744
745
class cv::cuda::Event::Impl
746
{
747
public:
748
cudaEvent_t event;
749
bool ownEvent;
750
751
explicit Impl(unsigned int flags);
752
explicit Impl(cudaEvent_t event);
753
~Impl();
754
};
755
756
cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0), ownEvent(false)
757
{
758
cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
759
ownEvent = true;
760
}
761
762
cv::cuda::Event::Impl::Impl(cudaEvent_t e) : event(e), ownEvent(false)
763
{
764
}
765
766
cv::cuda::Event::Impl::~Impl()
767
{
768
if (event && ownEvent)
769
{
770
cudaEventDestroy(event);
771
}
772
}
773
774
cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event)
775
{
776
return event.impl_->event;
777
}
778
779
Event cv::cuda::EventAccessor::wrapEvent(cudaEvent_t event)
780
{
781
return Event(makePtr<Event::Impl>(event));
782
}
783
784
#endif
785
786
// Create a new CUDA event with the requested creation flags.
cv::cuda::Event::Event(CreateFlags flags)
{
#ifndef HAVE_CUDA
    CV_UNUSED(flags);
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(flags);
#endif
}
795
796
// Record this event into `stream`: the event fires once all work
// queued on the stream before this call has completed.
void cv::cuda::Event::record(Stream& stream)
{
#ifndef HAVE_CUDA
    CV_UNUSED(stream);
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) );
#endif
}
805
806
bool cv::cuda::Event::queryIfComplete() const
807
{
808
#ifndef HAVE_CUDA
809
throw_no_cuda();
810
#else
811
cudaError_t err = cudaEventQuery(impl_->event);
812
813
if (err == cudaErrorNotReady || err == cudaSuccess)
814
return err == cudaSuccess;
815
816
cudaSafeCall(err);
817
return false;
818
#endif
819
}
820
821
void cv::cuda::Event::waitForCompletion()
822
{
823
#ifndef HAVE_CUDA
824
throw_no_cuda();
825
#else
826
cudaSafeCall( cudaEventSynchronize(impl_->event) );
827
#endif
828
}
829
830
float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
831
{
832
#ifndef HAVE_CUDA
833
CV_UNUSED(start);
834
CV_UNUSED(end);
835
throw_no_cuda();
836
#else
837
float ms;
838
cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );
839
return ms;
840
#endif
841
}
842
843