So far, so bad.
I found a couple of things. A couple of bad things.
1. shared_ptr is not thread safe * it was stack overflow by recursive destruction
Here's the code to reproduce. The reason follows.
SPIntSLList.h
Code:
#ifndef INT_SINGLY_LINKEDLIST | |
#define INT_SINGLY_LINKEDLIST | |
| |
#include <boost/shared_ptr.hpp> | |
| |
class SPIntSLLNode { | |
| |
private: | |
int index; | |
long int payload[4]; | |
boost::shared_ptr<SPIntSLLNode> next; | |
| |
SPIntSLLNode(int ix){ | |
index = ix; | |
} | |
| |
public: | |
int hasNext(){return next != 0;} | |
boost::shared_ptr<SPIntSLLNode>& moveNext(){return next;} | |
int getIndex(){return index;} | |
friend class SPIntSLList; | |
}; | |
| |
class SPIntSLList { | |
private: | |
boost::shared_ptr<SPIntSLLNode> head, tail; | |
public: | |
~SPIntSLList(); | |
| |
void addNode(int index); | |
boost::shared_ptr<SPIntSLLNode>& getHeadNode(){return head;} | |
}; | |
| |
#endif |
SPIntSLList.cpp
Code:
#include <iostream> | |
#include <time.h> | |
#include "SPIntSLList.h" | |
| |
SPIntSLList::~SPIntSLList(){ | |
// no need iterative deletion | |
} | |
| |
void SPIntSLList::addNode(int index){ | |
boost::shared_ptr<SPIntSLLNode> n(new SPIntSLLNode(index)); | |
if(head == 0){ | |
head = n; | |
tail = n; | |
} | |
else | |
{ | |
tail->next = n; | |
tail = n; | |
} | |
} | |
| |
int main(){ | |
| |
std::cout << "sizeof(SPIntSLLNode) = " << sizeof(SPIntSLLNode) << std::endl; | |
| |
clock_t start, end; | |
| |
std::cout << "creating huge large SPIntSLList" << std::endl; | |
start = clock(); | |
SPIntSLList huge; | |
std::cout << "creating huge SPIntSLListNode" << std::endl; | |
// 157160: segmetation fault boarder when load was evaluated | |
for(int i=0; i<1000000; i++){ | |
huge.addNode(i); | |
// n, temporal PSP is destructed here | |
} | |
end = clock(); | |
std::cout << "creation time: " << (end-start) << " clocks(" << (double) (end-start)/CLOCKS_PER_SEC << " sec)" << std::endl; | |
| |
std::cout << "iterating over huge SPIntSLListNode" << std::endl; | |
start = clock(); | |
// the following code calls copy constructor instead of assignment operator | |
boost::shared_ptr<SPIntSLLNode> current = huge.getHeadNode(); | |
while(current->hasNext()){ | |
if(current->getIndex() % 10000 == 0) | |
std::cout << "index:" << current->getIndex() << std::endl; | |
current = current->moveNext(); | |
} | |
end = clock(); | |
std::cout << "iteration time: " << (end-start) << " clocks(" << (double) (end-start)/CLOCKS_PER_SEC << " sec)" << std::endl; | |
std::cout << "completed huge SPIntSLList" << std::endl; | |
} |
Then, a wonderful gift.
Segmentation fault (core dumped)
And here's the stack trace.
Code:
Program terminated with signal 11, Segmentation fault. | |
#0 0x0000000000401227 in boost::detail::atomic_exchange_and_add (pw=Cannot access memory at address 0x7fff2ee2dff8 | |
) | |
at /home/takenori/workspace_native/Boost/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp:36 | |
36 { | |
(gdb) bt | |
#0 0x0000000000401227 in boost::detail::atomic_exchange_and_add (pw=Cannot access memory at address 0x7fff2ee2dff8 | |
) | |
at /home/takenori/workspace_native/Boost/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp:36 | |
#1 0x000000000040137b in boost::detail::sp_counted_base::release ( | |
this=0xd921130) | |
at /home/takenori/workspace_native/Boost/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp:143 | |
#2 0x000000000040142b in boost::detail::shared_count::~shared_count ( | |
this=0xd9210c0, __in_chrg=<value optimized out>) | |
at /home/takenori/workspace_native/Boost/boost/smart_ptr/detail/shared_count.hpp:217 | |
#3 0x00000000004014be in boost::shared_ptr<SPIntSLLNode>::~shared_ptr ( | |
this=0xd9210b8, __in_chrg=<value optimized out>) | |
at /home/takenori/workspace_native/Boost/boost/smart_ptr/shared_ptr.hpp:163 |
They have the same mistake as mine.
Code:
~shared_count() // nothrow | |
{ | |
if( pi_ != 0 ) pi_->release(); | |
#if defined(BOOST_SP_ENABLE_DEBUG_HOOKS) | |
id_ = 0; | |
#endif | |
} |
After "pi_ != 0", pi is not guarranteed to stay valid. It is possible to delete after its evaluation.
The last two referencing threads can enter release().
Code:
void release() // nothrow | |
{ | |
if( atomic_exchange_and_add( &use_count_, -1 ) == 1 ) | |
{ | |
dispose(); | |
weak_release(); | |
} | |
} |
Then, one comes late experiences Segmentation Fault on $use_count_.
2. Atomic types and operations are not as fast as RCU(read copy update)
I expected this(from the presentation by liburcu).

But the performance of lock-free reference counting smart pointer is as slow as mutex, on my dual core linux box.
The difference might be less significant because it's only dual core on 1 CPU. But I doubt it.
The test was like this.
Code:
#include "TsPersistentSmartPointer.h" | |
#include <iostream> | |
#include <time.h> | |
| |
#define MAX_THREADS 100 | |
#define ITERATION 1000 | |
| |
class Foo{ | |
public: | |
Foo(int id) : id_(id){std::cout << id_ << ":constructer called" << std::endl;} | |
~Foo(){std::cout << id_ << ":destructer called" << std::endl;} | |
| |
void SayHello(){ | |
std::cout << "Hello, this is " << id_ << std::endl; | |
} | |
private: | |
int id_; | |
}; | |
| |
void *copy(void *arg){ | |
// 1. copy, adding a reference | |
TsPersistentSmartPointer<Foo> psp = *((TsPersistentSmartPointer<Foo>*) arg); | |
TsPersistentSmartPointer<Foo> pspCopied = psp; | |
| |
// 2. removing a reference when exiting | |
} | |
| |
int main(){ | |
| |
TsPersistentSmartPointer<Foo> psp(new Foo(3)); | |
pthread_t threads[MAX_THREADS]; | |
int rt; | |
| |
clock_t start, end; | |
std::cout << "Executing..." << std::endl; | |
start = clock(); | |
for(int i=0; i<ITERATION; i++) | |
{ | |
for(int t=0; t<MAX_THREADS; t++){ | |
rt = pthread_create(&threads[t], NULL, copy, (void*) &psp); | |
if(rt != 0){ | |
std::cout << "Failed to create thread " << t << std::endl; | |
} | |
} | |
| |
for(int t=0; t<MAX_THREADS; t++){ | |
rt = pthread_join(threads[t], NULL); | |
if(rt != 0){ | |
std::cout << "Failed to join thread " << t << std::endl; | |
} | |
} | |
} | |
| |
end = clock(); | |
std::cout << "execution time: " << (end-start) << " clocks(" << (double) (end-start)/CLOCKS_PER_SEC << " sec)" << std::endl; | |
} |
This is heavily read intensive test. But each read on reference counting SmartPointer requires incrementing the counter, and decrementing when exiting. So it is deadly heavy write intensive test. It would run as low as Mutex in the figure above.
liburcu is implemented with architecture specific assembly code. But what's the difference? That's what I ought to find...