Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
N
neoppod
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
1
Issues
1
List
Boards
Labels
Milestones
Merge Requests
2
Merge Requests
2
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
nexedi
neoppod
Commits
d66b4f24
Commit
d66b4f24
authored
Nov 06, 2018
by
Julien Muchembled
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
storage: check for conflicts when notifying that the a partition is replicated
parent
f25b8ee3
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
124 additions
and
6 deletions
+124
-6
neo/storage/transactions.py
neo/storage/transactions.py
+16
-6
neo/tests/threaded/test.py
neo/tests/threaded/test.py
+108
-0
No files found.
neo/storage/transactions.py
View file @
d66b4f24
...
...
@@ -178,11 +178,21 @@ class TransactionManager(EventQueue):
self
.
_app
.
master_conn
.
send
(
Packets
.
NotifyReplicationDone
(
partition
,
replicated
[
partition
]))
replicated
[
partition
]
=
None
for
oid
,
ttid
in
store_lock_dict
.
iteritems
():
if
getPartition
(
oid
)
in
notify
:
for
oid
in
[
oid
for
oid
in
store_lock_dict
if
getPartition
(
oid
)
in
notify
]:
ttid
=
store_lock_dict
.
pop
(
oid
)
txn
=
self
.
_transaction_dict
[
ttid
]
# Use 'discard' instead of 'remove', for oids that were
# locked after that the partition was replicated.
self
.
_transaction_dict
[
ttid
].
lockless
.
discard
(
oid
)
txn
.
lockless
.
discard
(
oid
)
try
:
locked
=
self
.
lockObject
(
ttid
,
txn
.
serial_dict
[
oid
],
oid
)
except
ConflictError
:
self
.
_unstore
(
txn
,
oid
)
except
(
DelayEvent
,
NonReadableCell
),
e
:
# pragma: no cover
raise
AssertionError
(
e
)
else
:
assert
locked
,
(
oid
,
ttid
,
txn
)
def
register
(
self
,
conn
,
ttid
):
"""
...
...
@@ -277,7 +287,7 @@ class TransactionManager(EventQueue):
self
.
lockObject
(
ttid
,
serial
,
oid
)
except
ConflictError
:
recheck_set
.
add
(
oid
)
except
Exception
,
e
:
# pragma: no cover
except
(
DelayEvent
,
NonReadableCell
)
,
e
:
# pragma: no cover
raise
AssertionError
(
e
)
return
recheck_set
...
...
neo/tests/threaded/test.py
View file @
d66b4f24
...
...
@@ -2364,6 +2364,114 @@ class Test(NEOThreadedTest):
self
.
assertFalse
(
end
)
self
.
assertPartitionTable
(
cluster
,
'UU|UU'
)
@
with_cluster
(
start_cluster
=
0
,
storage_count
=
2
,
replicas
=
1
)
def
testLocklessWriteDuringConflictResolution
(
self
,
cluster
):
"""
This test reproduces a scenario during which the storage node didn't
check for conflicts when notifying the master that a partition is
replicated. The consequence was that it kept having write locks in a
weird state, relying on replicas to do the checking. While this may
work in itself, the code was hazardous and the client can't easily
discard such "positive" answers, or process them early enough.
The scenario focuses on transaction t1 (storing object A)
and node s1 (initially replicating the only partition):
1. t1 stores: conflict on s0, lockless write on s1
2. t2 stores: locked on s0, lockless write on s1
3. t1 resolves: deadlock on s0, packet to s1 delayed
4. t2 commits: on s1, a single lockless write remains and the storage
node notifies the master that it is UP_TO_DATE
5. s1 receives the second store from t1: answer delayed
6. t2 begins a new transaction
7. t1 resolves the deadlock: conflict on s0, s1 asks to rebase
8. t2 stores and vote
9. s0 down
10. while t2 finishes, t1 starts solving the conflict and due to the
way packets are processed, it proceeds as follows:
a. answer in step 5 is received but not processed
b. data asked, but not for the last serial
(so there will be yet another conflict to solve)
c. new data is stored: client waiting for s1
d. answer in step 5 is processed
What happened before:
4. t1 still has the lock
5. locked ("Transaction storing more than once")
10d. store considered successful, and the data won't be there anymore
for the actual conflict
-> assertion failure
Now that the storage nodes discards lockless writes that actually
conflict:
4. t1 does not have the lock anymore
5. conflict
10d. ignored (conflict already resolved)
-> transaction aborted normally
"""
s0
,
s1
=
cluster
.
storage_list
cluster
.
start
(
storage_list
=
(
s0
,))
t1
,
c1
=
cluster
.
getTransaction
()
x1
=
c1
.
root
()[
''
]
=
PCounterWithResolution
()
t1
.
commit
()
x1
.
value
+=
1
with
cluster
.
newClient
(
1
)
as
db
,
ConnectionFilter
()
as
f
:
delayReplication
=
f
.
delayAnswerFetchObjects
()
delayed
=
[]
delayStore
=
f
.
delayAskStoreObject
(
lambda
conn
:
conn
.
uuid
in
delayed
and
self
.
getConnectionApp
(
conn
)
is
cluster
.
client
)
delayStored
=
f
.
delayAnswerStoreObject
(
lambda
conn
:
conn
.
uuid
==
cluster
.
client
.
uuid
and
self
.
getConnectionApp
(
conn
).
uuid
in
delayed
)
def
load
(
orig
,
oid
,
at_tid
,
before_tid
):
if
delayed
:
p
.
revert
()
f
.
remove
(
delayStored
)
s0
.
stop
()
cluster
.
join
((
s0
,))
self
.
tic
()
return
orig
(
oid
,
at_tid
,
before_tid
)
s1
.
start
()
self
.
tic
()
cluster
.
neoctl
.
enableStorageList
([
s1
.
uuid
])
cluster
.
neoctl
.
tweakPartitionTable
()
self
.
tic
()
t2
,
c2
=
cluster
.
getTransaction
(
db
)
x2
=
c2
.
root
()[
''
]
x2
.
value
+=
2
t2
.
commit
()
x2
.
value
+=
4
def
tic1
(
*
args
,
**
kw
):
yield
1
self
.
tic
()
def
t1_resolve
(
*
args
,
**
kw
):
delayed
.
append
(
s1
.
uuid
)
f
.
remove
(
delayReplication
)
self
.
tic
()
yield
1
self
.
tic
()
def
t2_begin
(
*
args
,
**
kw
):
f
.
remove
(
delayStore
)
yield
0
@
self
.
newPausedThread
def
commit2
():
t2
.
commit
()
x2
.
value
+=
8
t2
.
commit
()
with
Patch
(
cluster
.
client
,
_loadFromStorage
=
load
)
as
p
,
\
self
.
thread_switcher
((
commit2
,),
(
1
,
0
,
tic1
,
0
,
t1_resolve
,
1
,
t2_begin
,
0
,
1
,
1
,
0
),
(
'tpc_begin'
,
'tpc_begin'
,
1
,
1
,
1
,
'StoreTransaction'
,
'tpc_begin'
,
'RebaseTransaction'
,
'RebaseTransaction'
,
1
,
'StoreTransaction'
))
as
end
:
self
.
assertRaisesRegexp
(
NEOStorageError
,
'^partition 0 not fully write-locked$'
,
t1
.
commit
)
commit2
.
join
()
t1
.
begin
()
self
.
assertEqual
(
x1
.
value
,
14
)
self
.
assertPartitionTable
(
cluster
,
'OU'
)
@
with_cluster
(
partitions
=
2
,
storage_count
=
2
)
def
testUnstore
(
self
,
cluster
):
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment