With GCC 4.4, I tested how atomic types and operations of c++0x work by implementing lock-free reference counting smart pointer.
TsPersistentSmartPointer.h
Code:
| #ifndef PERSISTENT_SMART_POINTER |
| #define PERSISTENT_SMART_POINTER |
| |
| #include <cstdatomic> |
| |
| class Counter { |
| public: |
| Counter():i(1){} |
| ~Counter(){} |
| |
| void increment(){ |
| i.fetch_add( 1, std::memory_order_relaxed ); |
| } |
| int decrement(){ |
| return i.fetch_sub( 1, std::memory_order_relaxed ); |
| } |
| void reset(){ |
| i.store(0, std::memory_order_relaxed); |
| } |
| private: |
| std::atomic_uint i; |
| }; |
| |
| template <class T> |
| class TsPersistentSmartPointer { |
| public: |
| explicit TsPersistentSmartPointer(T* pointee) : pointee_(pointee), counter(new Counter()){ |
| } |
| TsPersistentSmartPointer() : pointee_(0), counter(new Counter()){ |
| }; // for null construction, has to be copied from somewhere else |
| ~TsPersistentSmartPointer(){ |
| if(counter->decrement() == 0){ |
| delete counter; |
| delete pointee_; |
| } |
| } |
| |
| T& operator*() const{ |
| return *pointee_; |
| } |
| |
| T* operator->() const{ |
| return pointee_; |
| } |
| |
| // assignment |
| TsPersistentSmartPointer(const TsPersistentSmartPointer& src){ |
| pointee_ = src.pointee_; |
| counter = src.counter; |
| counter->increment(); |
| //src.pointee_ = 0; // NOT DESTRUCTIVE |
| } |
| TsPersistentSmartPointer& operator=(const TsPersistentSmartPointer& src){ |
| if(this != &src){ |
| if(counter->decrement() == 0){ |
| delete counter; |
| if(pointee_) delete pointee_; |
| } |
| pointee_ = src.pointee_; |
| //src.pointee_ = 0; // NOT DESTRUCTIVE |
| counter = src.counter; |
| counter->increment(); |
| } |
| return *this; |
| } |
| TsPersistentSmartPointer& operator=(T* pointee){ |
| if(pointee_ != pointee){ |
| if(pointee_) |
| delete pointee_; |
| pointee_ = pointee; |
| counter->reset(); |
| } |
| return *this; |
| } |
| |
| // equality |
| inline friend bool operator==(const TsPersistentSmartPointer& psp, const T* pointee){ |
| return psp.pointee_ == pointee; |
| } |
| inline friend bool operator==(const T* pointee, const TsPersistentSmartPointer& psp){ |
| return pointee == psp.pointee_; |
| } |
| // inequality |
| inline friend bool operator!=(const TsPersistentSmartPointer& psp, const T* pointee){ |
| return psp.pointee_ != pointee; |
| } |
| inline friend bool operator!=(const T* pointee, const TsPersistentSmartPointer& psp){ |
| return pointee != psp.pointee_; |
| } |
| |
| private: |
| T* pointee_; |
| Counter* counter; |
| }; |
| |
| #endif |
Atomic operations are implemented inside Counter class. The point is the following part, the destructor of TsPersistentSmartPointer:
Code:
| if(counter->decrement() == 0){ |
| delete counter; |
| delete pointee_; |
| } |
At first, I did the following, which resulted in Segmentation Fault under a million of concurrent destructions.
Code:
| counter->decrement(); |
| if(counter->get() == 0){ |
| delete counter; |
| delete pointee_; |
| } |
The problem could happen when the last two enters concurrently.
Thread1: counter->decrement(); // i = 1
Thread2: counter->decrement(); // i = 0
Thread2: counter->get() == 0 // TRUE
Thread1: counter->get() == 0 // TRUE!
Thread1: delete counter;
Thread2: delete counter; // OUCH!
Here're some more code for testing.
TsPersistentSmartPointer.cpp
Code:
| #include "TsPersistentSmartPointer.h" |
| #include <iostream> |
| |
| class Foo{ |
| public: |
| Foo(int id) : id_(id){std::cout << id_ << ":constructer called" << std::endl;} |
| ~Foo(){std::cout << id_ << ":destructer called" << std::endl;} |
| |
| void SayHello(){ |
| std::cout << "Hello, this is " << id_ << std::endl; |
| } |
| private: |
| int id_; |
| }; |
| |
| int main(){ |
| |
| Foo f0(0); |
| Foo *f1 = new Foo(1); |
| Foo *f2 = new Foo(2); // memory leak |
| Foo *f3 = new Foo(3); |
| TsPersistentSmartPointer<Foo> psp(0); |
| |
| // copy |
| psp = f3; |
| |
| // dereference and indirection |
| psp->SayHello(); |
| (*psp).SayHello(); |
| |
| // equality |
| std::cout << "equality works? :" << ((psp == f3) & (f3 == psp)) << std::endl; |
| |
| // copy constructor |
| TsPersistentSmartPointer<Foo> pspCopied(psp); |
| // dereference and indirection |
| pspCopied->SayHello(); |
| (*pspCopied).SayHello(); |
| |
| // see how destructors are called |
| delete f1; |
| |
| return 0; |
| } |
TsPSPIntSLList.h
Code:
| #ifndef INT_SINGLY_LINKEDLIST |
| #define INT_SINGLY_LINKEDLIST |
| |
| #include "TsPersistentSmartPointer.h" |
| |
| class TsPSPIntSLLNode { |
| |
| private: |
| int index; |
| long int payload[4]; |
| TsPersistentSmartPointer<TsPSPIntSLLNode> next; |
| |
| TsPSPIntSLLNode(int ix){ |
| index = ix; |
| } |
| |
| public: |
| int hasNext(){return next != 0;} |
| TsPersistentSmartPointer<TsPSPIntSLLNode>& moveNext(){return next;} |
| int getIndex(){return index;} |
| friend class TsPSPIntSLList; |
| }; |
| |
| class TsPSPIntSLList { |
| private: |
| TsPersistentSmartPointer<TsPSPIntSLLNode> head, tail; |
| public: |
| ~TsPSPIntSLList(); |
| |
| void addNode(int index); |
| TsPersistentSmartPointer<TsPSPIntSLLNode>& getHeadNode(){return head;} |
| }; |
| |
| #endif |
TsPSPIntSLList.cpp
Code:
| #include <iostream> |
| #include <time.h> |
| #include "TsPSPIntSLList.h" |
| |
| TsPSPIntSLList::~TsPSPIntSLList(){ |
| // no need iterative deletion |
| } |
| |
| void TsPSPIntSLList::addNode(int index){ |
| TsPersistentSmartPointer<TsPSPIntSLLNode> n(new TsPSPIntSLLNode(index)); |
| if(head == 0){ |
| head = n; |
| tail = n; |
| } |
| else |
| { |
| tail->next = n; |
| tail = n; |
| } |
| } |
| |
| int main(){ |
| |
| std::cout << "sizeof(TsPSPIntSLLNode) = " << sizeof(TsPSPIntSLLNode) << std::endl; |
| |
| clock_t start, end; |
| |
| std::cout << "creating huge large TsPSPIntSLList" << std::endl; |
| start = clock(); |
| TsPSPIntSLList huge; |
| std::cout << "creating huge TsPSPIntSLListNode" << std::endl; |
| // 157160: segmetation fault boarder when load was evaluated |
| for(int i=0; i<1000000; i++){ |
| huge.addNode(i); |
| // n, temporal PSP is destructed here |
| } |
| end = clock(); |
| std::cout << "creation time: " << (end-start) << " clocks(" << (double) (end-start)/CLOCKS_PER_SEC << " sec)" << std::endl; |
| |
| std::cout << "iterating over huge TsPSPIntSLListNode" << std::endl; |
| start = clock(); |
| // the following code calls copy constructor instead of assignment operator |
| TsPersistentSmartPointer<TsPSPIntSLLNode> current = huge.getHeadNode(); |
| while(current->hasNext()){ |
| if(current->getIndex() % 10000 == 0) |
| std::cout << "index:" << current->getIndex() << std::endl; |
| current = current->moveNext(); |
| } |
| end = clock(); |
| std::cout << "iteration time: " << (end-start) << " clocks(" << (double) (end-start)/CLOCKS_PER_SEC << " sec)" << std::endl; |
| std::cout << "completed huge TsPSPIntSLList" << std::endl; |
| |
| // followings are called automatically, but here explicitely for measurement |
| //current.~TsPersistentSmartPointer(); |
| //huge.~TsPSPIntSLList(); |
| |
| } |
Please note no code is required for the destructor of TsPersistentSmartPointer.
So what about reference linking?
No. It can't be lock free if with doubly linked list. Two links to prev and next have to be updated at the same time, otherwise one could see inconsistent stete.
Performance?
I have measured performance with clock() function defined in "time.h" as in TsPSPIntSLList.cpp.
Raw Pointer for single thread
Creation: 0.01 sec
Iteration: 0.02 sec
SmartPointer for single thread
Creation: 0.27 sec
Iteration: 0.05 sec
Lock-free SmartPointer
Creation: 0.45 sec
Iteration: 0.12 sec
In time, there's a significant overhead over plain raw pointer implementation.
Also, a couple of CPU cache statistics collected by vaigrind.
Raw Pointer for single thread
Instruction Read: 409,523,865
Data Read: 115,388,899
L1 Data Read Miss: 2,013,421
L2 Data Read Miss: 2,005,354
Data Write: 69,133,007
L1 Data Write Miss: 1,002,531
L2 Data Write Miss: 1,001,018
Smart Pointer for single thread
Instruction Read: 960,359,665
Data Read: 284,618,659
L1 Data Read Miss: 2,049,844
L2 Data Read Miss: 2,041,472
Data Write: 172,410,809
L1 Data Write Miss: 1,916,330
L2 Data Write Miss: 1,914,709
Lock-free Smart Pointer
Instruction Read: 1,161,535,161
Data Read: 317,391,770
L1 Data Read Miss: 2,013,687
L2 Data Read Miss: 2,005,198
Data Write: 235,134,100
L1 Data Write Miss: 2,002,377
L2 Data Write Miss: 2,001,012
(pthread Mutex SmartPointer)
Creation: 0.51 sec
Iteration: 0.14 sec
Instruction Read: 1,149,615,787
Data Read: 311,412,927
L1 Data Read Miss: 3,014,567
L2 Data Read Miss: 3,005,312
Data Write: 237,144,326
L1 Data Write Miss: 3,002,797
L2 Data Write Miss: 3,001,194
Please note L1 and L2 cache read miss stays around as total number increases.
(except mutex: mutex calls another million of malloc)
It is not free lunch. But for read intensive applications, how attractive it is!