10 Commits

Author SHA1 Message Date
lash
5459d4c3f8 Upgrade deps 2022-05-04 18:22:29 +00:00
lash
3e05717395 Add error line for rpc fails in dispatch 2022-05-04 06:44:40 +00:00
lash
54d10ee40b Upgrade chainqueue 2022-05-04 05:43:39 +00:00
lash
9cffdc5867 Factor out dispatch processor from chain implementation 2022-05-04 05:38:01 +00:00
lash
4f96be2024 Upgrade chainqueue 2022-05-03 17:22:59 +00:00
lash
32e1bc6aa5 Correct purge call, add missing lock module 2022-05-02 20:10:30 +00:00
lash
387014f77b Purge items from memory state on final 2022-05-02 20:05:41 +00:00
lash
c3a592c0f6 Implement shep store lock 2022-05-02 09:59:13 +00:00
lash
5d61506133 WIP whackamole race condition problems 2022-05-01 07:55:51 +00:00
lash
5102b4ac6e Fix crashes related to race condition hits 2022-05-01 07:31:18 +00:00
11 changed files with 203 additions and 61 deletions

View File

@@ -1,3 +1,15 @@
- 0.2.7
* Upgrade chainlib
- 0.2.6
* Deps upgrade
- 0.2.5
* Deps upgrade
- 0.2.4
* Allow omission of state store sync in queue store backend
- 0.2.2
* Fix missing symbol crashes related to race conditions
- 0.2.1
* Receive removed race checks from chainqueue
- 0.2.0 - 0.2.0
* primitive race condition handling between fs access of sync and queue * primitive race condition handling between fs access of sync and queue
* re-enable throttling based on in-flight transaction count * re-enable throttling based on in-flight transaction count

View File

@@ -1,28 +1,27 @@
# standard imports
import logging
import time
# external imports # external imports
from chainqueue import Store as QueueStore from chainqueue import Store as QueueStore
# local imports # local imports
from chaind.error import BackendIntegrityError from chaind.lock import StoreLock
logg = logging.getLogger(__name__)
class ChaindAdapter: class ChaindAdapter:
race_delay = 0.1 def __init__(self, chain_spec, state_store, index_store, counter_store, cache_adapter, dispatcher, cache=None, pending_retry_threshold=0, error_retry_threshold=0, store_sync=True):
def __init__(self, chain_spec, state_store, index_store, counter_store, cache_adapter, dispatcher, cache=None, pending_retry_threshold=0, error_retry_threshold=0):
self.cache_adapter = cache_adapter self.cache_adapter = cache_adapter
self.dispatcher = dispatcher self.dispatcher = dispatcher
err = None store_lock = StoreLock()
for i in range(3): while True:
try: try:
self.store = QueueStore(chain_spec, state_store, index_store, counter_store, cache=cache) self.store = QueueStore(chain_spec, state_store, index_store, counter_store, cache=cache, sync=store_sync)
err = None
break break
except FileNotFoundError as e: except FileNotFoundError as e:
logg.debug('queuestore instantiation failed, possible race condition (will try again): {}'.format(tx_hash, e)) logg.debug('queuestore instantiation failed, possible race condition (will try again): {}'.format(e))
err = e store_lock.again()
time.sleep(self.race_delay)
continue continue
if err != None:
raise BackendIntegrityError(err)

View File

@@ -1,6 +1,7 @@
# standard imports # standard imports
import logging import logging
import os import os
import time
# external imports # external imports
from chainlib.error import RPCException from chainlib.error import RPCException
@@ -11,24 +12,27 @@ from chainqueue.store.fs import (
CounterStore, CounterStore,
) )
from shep.store.file import SimpleFileStoreFactory from shep.store.file import SimpleFileStoreFactory
from shep.error import StateInvalid from shep.error import (
StateInvalid,
StateLockedKey,
)
# local imports # local imports
from .base import ChaindAdapter from .base import ChaindAdapter
from chaind.error import BackendIntegrityError from chaind.lock import StoreLock
logg = logging.getLogger(__name__) logg = logging.getLogger(__name__)
class ChaindFsAdapter(ChaindAdapter): class ChaindFsAdapter(ChaindAdapter):
def __init__(self, chain_spec, path, cache_adapter, dispatcher, cache=None, pending_retry_threshold=0, error_retry_threshold=0, digest_bytes=32, event_callback=None): def __init__(self, chain_spec, path, cache_adapter, dispatcher, cache=None, pending_retry_threshold=0, error_retry_threshold=0, digest_bytes=32, event_callback=None, store_sync=True):
factory = SimpleFileStoreFactory(path).add factory = SimpleFileStoreFactory(path, use_lock=True).add
state_store = Status(factory, allow_invalid=True, event_callback=event_callback) state_store = Status(factory, allow_invalid=True, event_callback=event_callback)
index_path = os.path.join(path, 'tx') index_path = os.path.join(path, 'tx')
index_store = IndexStore(index_path, digest_bytes=digest_bytes) index_store = IndexStore(index_path, digest_bytes=digest_bytes)
counter_store = CounterStore(path) counter_store = CounterStore(path)
super(ChaindFsAdapter, self).__init__(chain_spec, state_store, index_store, counter_store, cache_adapter, dispatcher, cache=cache, pending_retry_threshold=pending_retry_threshold, error_retry_threshold=error_retry_threshold) super(ChaindFsAdapter, self).__init__(chain_spec, state_store, index_store, counter_store, cache_adapter, dispatcher, cache=cache, pending_retry_threshold=pending_retry_threshold, error_retry_threshold=error_retry_threshold, store_sync=store_sync)
def put(self, signed_tx): def put(self, signed_tx):
@@ -38,21 +42,23 @@ class ChaindFsAdapter(ChaindAdapter):
def get(self, tx_hash): def get(self, tx_hash):
v = None v = None
err = None store_lock = StoreLock()
for i in range(3): while True:
try: try:
v = self.store.get(tx_hash) v = self.store.get(tx_hash)
err = None break
except StateInvalid as e: except StateInvalid as e:
logg.error('I am just a simple syncer and do not know how to handle the state which the tx {} is in: {}'.format(tx_hash, e)) logg.error('I am just a simple syncer and do not know how to handle the state which the tx {} is in: {}'.format(tx_hash, e))
return None return None
except FileNotFoundError as e: except FileNotFoundError as e:
err = e logg.debug('queuestore get (file missing) {} failed, possible race condition (will try again): {}'.format(tx_hash, e))
time.sleep(self.race_delay) store_lock.again()
logg.debug('queuestore get {} failed, possible race condition (will try again): {}'.format(tx_hash, e))
continue continue
if v ==None: except StateLockedKey as e:
raise BackendIntegrityError(tx_hash) logg.debug('queuestore get (statelock) {} failed, possible race condition (will try again): {}'.format(tx_hash, e))
store_lock.again()
continue
return v[1] return v[1]
@@ -84,12 +90,19 @@ class ChaindFsAdapter(ChaindAdapter):
def succeed(self, block, tx): def succeed(self, block, tx):
if self.store.is_reserved(tx.hash): if self.store.is_reserved(tx.hash):
raise QueueLockError(tx.hash) raise QueueLockError(tx.hash)
r = self.store.final(tx.hash, block, tx, error=False)
return self.store.final(tx.hash, block, tx, error=False) (k, v) = self.store.get(tx.hash)
self.store.purge(k)
return r
def fail(self, block, tx): def fail(self, block, tx):
return self.store.final(tx.hash, block, tx, error=True) if self.store.is_reserved(tx.hash):
raise QueueLockError(tx.hash)
r = self.store.final(tx.hash, block, tx, error=True)
(k, v) = self.store.get(tx.hash)
self.store.purge(k)
return r
def sendfail(self): def sendfail(self):
@@ -101,15 +114,44 @@ class ChaindFsAdapter(ChaindAdapter):
def dispatch(self, tx_hash): def dispatch(self, tx_hash):
entry = self.store.send_start(tx_hash) entry = None
store_lock = StoreLock()
while True:
try:
entry = self.store.send_start(tx_hash)
break
except FileNotFoundError as e:
logg.debug('dispatch failed to find {} in backend, will try again: {}'.format(tx_hash, e))
store_lock.again()
continue
except StateLockedKey as e:
logg.debug('dispatch failed to find {} in backend, will try again: {}'.format(tx_hash, e))
store_lock.again()
continue
tx_wire = entry.serialize() tx_wire = entry.serialize()
r = None r = None
try: try:
r = self.dispatcher.send(tx_wire) r = self.dispatcher.send(tx_wire)
except RPCException: except RPCException as e:
logg.error('dispatch send failed for {}: {}'.format(tx_hash, e))
self.store.fail(tx_hash) self.store.fail(tx_hash)
return False return False
self.store.send_end(tx_hash) store_lock = StoreLock()
while True:
try:
self.store.send_end(tx_hash)
break
except FileNotFoundError as e:
logg.debug('dispatch failed to find {} in backend, will try again: {}'.format(tx_hash, e))
store_lock.again(e)
continue
except StateLockedKey as e:
logg.debug('dispatch failed to find {} in backend, will try again: {}'.format(tx_hash, e))
store_lock.again(e)
continue
return True return True

33
chaind/dispatch.py Normal file
View File

@@ -0,0 +1,33 @@
# standard imports
import logging
# local ipmorts
from chaind.adapters.fs import ChaindFsAdapter
from chaind.eth.cache import EthCacheTx
logg = logging.getLogger(__name__)
class DispatchProcessor:
def __init__(self, chain_spec, queue_dir, dispatcher):
self.dispatcher = dispatcher
self.chain_spec = chain_spec,
self.queue_dir = queue_dir
def process(self, rpc, limit=50):
adapter = ChaindFsAdapter(
self.chain_spec,
self.queue_dir,
EthCacheTx,
self.dispatcher,
)
upcoming = adapter.upcoming(limit=limit)
logg.info('processor has {} candidates for {}, processing with limit {}'.format(len(upcoming), self.chain_spec, limit))
i = 0
for tx_hash in upcoming:
if adapter.dispatch(tx_hash):
i += 1
return i

View File

@@ -22,5 +22,5 @@ class QueueLockError(Exception):
pass pass
class BackendIntegrityError(Exception): class BackendError(Exception):
pass pass

View File

@@ -7,21 +7,20 @@ from chainlib.status import Status as TxStatus
from chainsyncer.filter import SyncFilter from chainsyncer.filter import SyncFilter
from chainqueue.error import NotLocalTxError from chainqueue.error import NotLocalTxError
from chaind.adapters.fs import ChaindFsAdapter from chaind.adapters.fs import ChaindFsAdapter
from shep.error import StateLockedKey
# local imports # local imports
from .error import ( from .error import (
QueueLockError, QueueLockError,
BackendIntegrityError, BackendError,
) )
from chaind.lock import StoreLock
logg = logging.getLogger(__name__) logg = logging.getLogger(__name__)
class StateFilter(SyncFilter): class StateFilter(SyncFilter):
delay_limit = 3.0
race_delay = 0.1
def __init__(self, chain_spec, adapter_path, tx_adapter, throttler=None): def __init__(self, chain_spec, adapter_path, tx_adapter, throttler=None):
self.chain_spec = chain_spec self.chain_spec = chain_spec
self.adapter_path = adapter_path self.adapter_path = adapter_path
@@ -31,8 +30,9 @@ class StateFilter(SyncFilter):
def filter(self, conn, block, tx, session=None): def filter(self, conn, block, tx, session=None):
cache_tx = None cache_tx = None
for i in range(3): store_lock = StoreLock()
queue_adapter = None queue_adapter = None
while True:
try: try:
queue_adapter = ChaindFsAdapter( queue_adapter = ChaindFsAdapter(
self.chain_spec, self.chain_spec,
@@ -40,30 +40,31 @@ class StateFilter(SyncFilter):
self.tx_adapter, self.tx_adapter,
None, None,
) )
except BackendIntegrityError as e: except BackendError as e:
logg.error('adapter instantiation failed: {}, one more try'.format(e)) logg.error('adapter instantiation failed: {}, one more try'.format(e))
time.sleep(self.race_delay) store_lock.again()
continue continue
store_lock.reset()
try: try:
cache_tx = queue_adapter.get(tx.hash) cache_tx = queue_adapter.get(tx.hash)
break
except NotLocalTxError: except NotLocalTxError:
logg.debug('skipping not local transaction {}'.format(tx.hash)) logg.debug('skipping not local transaction {}'.format(tx.hash))
return False return False
except BackendIntegrityError as e: except BackendError as e:
logg.error('adapter instantiation failed: {}, one more try'.format(e)) logg.error('adapter get failed: {}, one more try'.format(e))
time.sleep(self.race_delay) queue_adapter = None
store_lock.again()
continue continue
break
if cache_tx == None: if cache_tx == None:
raise NotLocalTxError(tx.hash) raise NotLocalTxError(tx.hash)
delay = 0.01 store_lock = StoreLock()
queue_lock = StoreLock(error=QueueLockError)
while True: while True:
if delay > self.delay_limit:
raise QueueLockError('The queue lock for tx {} seems to be stuck. Human meddling needed.'.format(tx.hash))
try: try:
if tx.status == TxStatus.SUCCESS: if tx.status == TxStatus.SUCCESS:
queue_adapter.succeed(block, tx) queue_adapter.succeed(block, tx)
@@ -72,8 +73,21 @@ class StateFilter(SyncFilter):
break break
except QueueLockError as e: except QueueLockError as e:
logg.debug('queue item {} is blocked, will retry: {}'.format(tx.hash, e)) logg.debug('queue item {} is blocked, will retry: {}'.format(tx.hash, e))
time.sleep(delay) queue_lock.again()
delay *= 2 except FileNotFoundError as e:
logg.debug('queue item {} not found, possible race condition, will retry: {}'.format(tx.hash, e))
store_lock.again()
continue
except NotLocalTxError as e:
logg.debug('queue item {} not found, possible race condition, will retry: {}'.format(tx.hash, e))
store_lock.again()
continue
except StateLockedKey as e:
logg.debug('queue item {} not found, possible race condition, will retry: {}'.format(tx.hash, e))
store_lock.again()
continue
logg.info('filter registered {} for {} in {}'.format(tx.status.name, tx.hash, block))
if self.throttler != None: if self.throttler != None:
self.throttler.dec(tx.hash) self.throttler.dec(tx.hash)

34
chaind/lock.py Normal file
View File

@@ -0,0 +1,34 @@
# standard imports
import time
# local imports
from .error import BackendError
BASE_DELAY = 0.01
BASE_DELAY_LIMIT = 10.0
class StoreLock:
def __init__(self, delay=BASE_DELAY, delay_limit=BASE_DELAY_LIMIT, error=BackendError, description=None):
self.base_delay = delay
self.delay = delay
self.delay_limit = delay_limit
self.error = error
self.description = description
def again(self, e=None):
if self.delay > self.delay_limit:
err = None
if e != None:
err = str(e)
else:
err = self.description
raise self.error(err)
time.sleep(self.delay)
self.delay *= 2
def reset(self):
self.delay = self.base_delay

View File

@@ -8,19 +8,21 @@ import stat
from hexathon import strip_0x from hexathon import strip_0x
# local imports # local imports
from chaind.error import ( from .error import (
NothingToDoError, NothingToDoError,
ClientGoneError, ClientGoneError,
ClientBlockError, ClientBlockError,
ClientInputError, ClientInputError,
) )
from .lock import StoreLock
from .error import BackendError
logg = logging.getLogger(__name__) logg = logging.getLogger(__name__)
class SessionController: class SessionController:
def __init__(self, config, adapter, processor): def __init__(self, config, processor):
self.dead = False self.dead = False
os.makedirs(os.path.dirname(config.get('SESSION_SOCKET_PATH')), exist_ok=True) os.makedirs(os.path.dirname(config.get('SESSION_SOCKET_PATH')), exist_ok=True)
try: try:
@@ -35,7 +37,6 @@ class SessionController:
self.srv.settimeout(float(config.get('SESSION_DISPATCH_DELAY'))) self.srv.settimeout(float(config.get('SESSION_DISPATCH_DELAY')))
self.processor = processor self.processor = processor
self.chain_spec = config.get('CHAIN_SPEC') self.chain_spec = config.get('CHAIN_SPEC')
self.adapter = adapter
def shutdown(self, signo, frame): def shutdown(self, signo, frame):
@@ -59,7 +60,16 @@ class SessionController:
def process(self, conn): def process(self, conn):
r = self.processor(self.chain_spec, self.adapter, conn) state_lock = StoreLock()
r = None
while True:
try:
r = self.processor(conn)
break
except BackendError as e:
state_lock.again(e)
continue
if r > 0: if r > 0:
self.srv.settimeout(0.1) self.srv.settimeout(0.1)
else: else:

View File

@@ -29,8 +29,6 @@ class MockDispatcher:
def send(self, v): def send(self, v):
import sys
sys.stderr.write('susu v {} {}\n'.format(v, self.fails))
if v in self.fails: if v in self.fails:
raise RPCException('{} is in fails'.format(v)) raise RPCException('{} is in fails'.format(v))
pass pass

View File

@@ -1,5 +1,5 @@
chainlib~=0.1.1 chainlib~=0.1.2
chainqueue~=0.1.6 chainqueue~=0.1.11
chainsyncer~=0.4.3 chainsyncer~=0.4.3
confini~=0.6.0 confini~=0.6.0
funga~=0.5.2 funga~=0.5.2

View File

@@ -1,6 +1,6 @@
[metadata] [metadata]
name = chaind name = chaind
version = 0.2.1 version = 0.2.7
description = Base package for chain queue service description = Base package for chain queue service
author = Louis Holbrook author = Louis Holbrook
author_email = dev@holbrook.no author_email = dev@holbrook.no