Master process crashes with "signal SIGSEGV: segmentation violation" when using multiple OPE instances of a simple "string python" model under high load
The SIGSEGV happens inside neuropod::tensor_from_id(std::__1::array<char, 24ul> const&), while deserializing inference results received back from an OPE worker (see the stack trace below).
It does not happen immediately; hitting it requires particular conditions: high volume/load and a particular number of concurrent callers.
"Master" Process with 4 OPE instances of the same "string_python" mode performs round-robin on incoming Inference requests. 4 concurrent callers perform Inference calls. Master crashes with SEGV not immediately but at "high volume". It is 100% reproducible if generate 20K calls. If I send per 2K calls, it can take many iterations to hit it.
Trace log from just before the crash:

07/14/20 21:02:41.269865: T ./neuropod/multiprocess/mq/ipc_message_queue_impl.hh:152] [thread 118825, process 118433] OPE: Sending IPC control message 2.
07/14/20 21:02:41.269898: T ./neuropod/multiprocess/mq/ipc_message_queue_impl.hh:152] [thread 118442, process 118433] OPE: Sending IPC control message 2.
07/14/20 21:02:41.269912: T ./neuropod/multiprocess/mq/ipc_message_queue_impl.hh:91] [thread 118735, process 118733] OPE: Read thread received IPC control message 2.
07/14/20 21:02:41.269922: T neuropod/multiprocess/mq/transferrables.cc:56] [thread 118735, process 118733] OPE: Clearing transferrables for msg with id 1032
fatal error: unexpected signal during runtime execution
07/14/20 21:02:41.269957: T ./neuropod/multiprocess/mq/ipc_message_queue_impl.hh:91] [thread 118597, process 118595] OPE: Read thread received IPC control message 2.
07/14/20 21:02:41.269975: T neuropod/multiprocess/mq/transferrables.cc:56] [thread 118597, process 118595] OPE: Clearing transferrables for msg with id 1033
[signal SIGSEGV: segmentation violation code=0x1 addr=0x7fd320c10480 pc=0x7fd320d43797]
==27092==Register values:
rax = 0x000060300025ab48 rbx = 0x000060700005aae0 rcx = 0x00000000000000c4 rdx = 0x00000000000000c5
rdi = 0x0000000011b87060 rsi = 0x0000000011b870e0 rbp = 0x000070000b7e29f0 rsp = 0x000070000b7e2920
r8 = 0x0000000000000098 r9 = 0x0000000011e41440 r10 = 0x0000000000000011 r11 = 0x0000000011e41458
r12 = 0x0000000000000000 r13 = 0x0000000011e414b8 r14 = 0x000060e000130ca0 r15 = 0x0000000000000000
AddressSanitizer can not provide additional info.
SUMMARY: AddressSanitizer: SEGV (/Users/vkuzmin/gocode/src/code.uber.internal/data/michelangelo-deeplearning-inference/lib/libneuropod.so:x86_64+0xbac7d) in neuropod::tensor_from_id(std::__1::array<char, 24ul> const&)
(Interleaved debug output from the Go master, from a separate run, showing a sample 1x1 string tensor:)
2020-07-07T02:53:31.656-0700 DEBUG model/models.go:153 Tensor: &Tensor{Value:[],Dimensions:[1 1],Dtype:DTYPE_STRING,StringVector:[benchmark],}
#0 0x5f01c7d in neuropod::tensor_from_id(std::__1::array<char, 24ul> const&) (/Users/vkuzmin/gocode/src/code.uber.internal/data/michelangelo-deeplearning-inference/lib/libneuropod.so:x86_64+0xbac7d)
#1 0x5f033db in void neuropod::ipc_deserialize<std::__1::shared_ptr<neuropod::NeuropodValue> >(std::__1::basic_istream<char, std::__1::char_traits<char> >&, std::__1::shared_ptr<neuropod::NeuropodValue>&) (/Users/vkuzmin/gocode/src/code.uber.internal/data/michelangelo-deeplearning-inference/lib/libneuropod.so:x86_64+0xbc3db)
#2 0x5ef9266 in void neuropod::ipc_deserialize<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::shared_ptr<neuropod::NeuropodValue> >(std::__1::basic_istream<char, std::__1::char_traits<char> >&, std::__1::unordered_map<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::shared_ptr<neuropod::NeuropodValue>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const, std::__1::shared_ptr<neuropod::NeuropodValue> > > >&) (/Users/vkuzmin/gocode/src/code.uber.internal/data/michelangelo-deeplearning-inference/lib/libneuropod.so:x86_64+0xb2266)
#3 0x5ef900f in void neuropod::detail::deserialize_payload<std::__1::unordered_map<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::shared_ptr<neuropod::NeuropodValue>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const, std::__1::shared_ptr<neuropod::NeuropodValue> > > >, neuropod::MessageType>(neuropod::detail::WireFormat<neuropod::MessageType> const&, std::__1::unordered_map<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::shared_ptr<neuropod::NeuropodValue>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const, std::__1::shared_ptr<neuropod::NeuropodValue> > > >&) (/Users/vkuzmin/gocode/src/code.uber.internal/data/michelangelo-deeplearning-inference/lib/libneuropod.so:x86_64+0xb200f)
#4 0x5ee67cb in neuropod::(anonymous namespace)::MultiprocessNeuropodBackend::infer_internal(std::__1::unordered_map<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::shared_ptr<neuropod::NeuropodValue>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const, std::__1::shared_ptr<neuropod::NeuropodValue> > > > const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&) (/Users/vkuzmin/gocode/src/code.uber.internal/data/michelangelo-deeplearning-inference/lib/libneuropod.so:x86_64+0x9f7cb)
#5 0x5f3f08f in neuropod::NeuropodBackend::infer(std::__1::unordered_map<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::shared_ptr<neuropod::NeuropodValue>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const, std::__1::shared_ptr<neuropod::NeuropodValue> > > > const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&) (/Users/vkuzmin/gocode/src/code.uber.internal/data/michelangelo-deeplearning-inference/lib/libneuropod.so:x86_64+0xf808f)
#6 0x5e4c340 in neuropod::Neuropod::infer(std::__1::unordered_map<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::shared_ptr<neuropod::NeuropodValue>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const, std::__1::shared_ptr<neuropod::NeuropodValue> > > > const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&) (/Users/vkuzmin/gocode/src/code.uber.internal/data/michelangelo-deeplearning-inference/lib/libneuropod.so:x86_64+0x5340)
#7 0x494a31a in Infer (/Users/vkuzmin/gocode/src/code.uber.internal/data/michelangelo-deeplearning-inference/./ma-dl-inference:x86_64+0x494a31a)
#8 0x4948d07 in _cgo_f1624c697bd0_Cfunc_Infer (/Users/vkuzmin/gocode/src/code.uber.internal/data/michelangelo-deeplearning-inference/./ma-dl-inference:x86_64+0x4948d07)
#9 0x406918f in runtime.asmcgocall (/Users/vkuzmin/gocode/src/code.uber.internal/data/michelangelo-deeplearning-inference/./ma-dl-inference:x86_64+0x406918f)
The packaged "string_python" model itself is a trivial echo:

class Model:
    def __call__(self, str_input):
        return {'str_output': str_input}


def get_model(data_root):
    return Model()
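For completeness, this is roughly how such a model is packaged. A sketch assuming the create_python_neuropod packager from neuropod.packagers; the paths, package layout, and model name are hypothetical, and the argument names should be checked against the neuropod version in use.

import os
from neuropod.packagers import create_python_neuropod

# Hypothetical layout: the Model/get_model code above lives in
# string_python/model.py under this python root.
python_root = os.path.abspath("python_root")

create_python_neuropod(
    neuropod_path="string_python_neuropod",
    model_name="string_python",
    data_paths=[],
    code_path_spec=[
        {
            "python_root": python_root,
            "dirs_to_package": ["string_python"],
        }
    ],
    entrypoint_package="string_python.model",
    entrypoint="get_model",
    # A single string tensor in and out, matching the DTYPE_STRING
    # Dimensions:[1 1] tensor in the debug log above.
    input_spec=[{"name": "str_input", "dtype": "string", "shape": (None, None)}],
    output_spec=[{"name": "str_output", "dtype": "string", "shape": (None, None)}],
)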
It fails with 4 OPE instances and 4 concurrent callers.
It does not fail with 1, 2, or 8 concurrent callers.
When I send 20K messages in one run, it fails once it reaches ~3-4K; when I send in batches of 2K, it may not fail until around the 8K mark.