swift/test/probe/test_container_failures.py

388 lines
18 KiB
Python
Raw Normal View History

2010-07-12 17:03:45 -05:00
#!/usr/bin/python -u
# Copyright (c) 2010-2011 OpenStack, LLC.
2010-07-12 17:03:45 -05:00
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
2011-04-13 10:57:59 -07:00
import os
2010-07-12 17:03:45 -05:00
from os import kill
from signal import SIGTERM
from subprocess import Popen
from time import sleep
from uuid import uuid4
2011-04-13 10:57:59 -07:00
import eventlet
import sqlite3
2010-07-12 17:03:45 -05:00
2011-06-08 04:19:34 +00:00
from swift.common import client, direct_client
2011-04-13 10:57:59 -07:00
from swift.common.utils import hash_path, readconf
2010-07-12 17:03:45 -05:00
from test.probe.common import get_to_final_state, kill_pids, reset_environment
2010-07-12 17:03:45 -05:00
class TestContainerFailures(unittest.TestCase):
    """
    Probe tests for container behavior around container server failures.

    The node-failure tests kill one or two of a container's servers, delete
    an object through the proxy while they are down, restart them, and then
    check the account and container listings before and after
    ``get_to_final_state()``. The last test holds exclusive sqlite locks on
    the container databases and checks that a container delete does not
    hang.
    """

    def setUp(self):
        """Reset the test environment and record its handles on ``self``."""
        (self.pids, self.port2server, self.account_ring,
         self.container_ring, self.object_ring, self.url, self.token,
         self.account) = reset_environment()

    def tearDown(self):
        """Kill every server process recorded in ``self.pids``."""
        kill_pids(self.pids)

    def _account_container_names(self):
        """Return the container names in the proxy's account listing."""
        listing = client.get_account(self.url, self.token)[1]
        return [c['name'] for c in listing]

    def _container_object_names(self, container):
        """
        Return the object names in the proxy's listing of ``container``.

        :param container: name of the container to list
        """
        listing = client.get_container(self.url, self.token, container)[1]
        return [o['name'] for o in listing]

    def _direct_object_names(self, node, part, container):
        """
        Return the object names listed by one specific container server.

        :param node: ring node dict of the container server to ask
        :param part: ring partition of the container
        :param container: name of the container to list
        """
        listing = direct_client.direct_get_container(
            node, part, self.account, container)[1]
        return [o['name'] for o in listing]

    def _start_container_server(self, node):
        """
        Start a container server for ``node`` and record its new pid.

        The config file number is derived from the node's port as
        ``(port - 6001) // 10``. Floor division is used explicitly so the
        ``%d`` always receives the same integer the original ``/`` produced
        on Python 2 ints.

        :param node: ring node dict of the container server to start
        """
        conf_number = (node['port'] - 6001) // 10
        self.pids[self.port2server[node['port']]] = \
            Popen(['swift-container-server',
                   '/etc/swift/container-server/%d.conf' % conf_number]).pid

    def _delete_object_with_nodes_down(self, down_indexes):
        """
        Run the opening scenario shared by all the node-failure tests.

        Creates a container holding ``object1``, kills the container
        servers at ``down_indexes``, deletes ``object1`` through the proxy
        and finally restarts the killed servers.

        :param down_indexes: sequence of indexes into the container's ring
                             nodes naming the servers to take down
        :returns: tuple of (container, object1, cpart, cnodes)
        """
        container = 'container-%s' % uuid4()
        client.put_container(self.url, self.token, container)
        self.assertTrue(container in self._account_container_names())

        object1 = 'object1'
        client.put_object(self.url, self.token, container, object1, 'test')
        self.assertTrue(container in self._account_container_names())
        self.assertTrue(object1 in self._container_object_names(container))

        cpart, cnodes = self.container_ring.get_nodes(self.account,
                                                      container)
        for index in down_indexes:
            kill(self.pids[self.port2server[cnodes[index]['port']]],
                 SIGTERM)
        client.delete_object(self.url, self.token, container, object1)
        self.assertTrue(container in self._account_container_names())
        self.assertTrue(
            object1 not in self._container_object_names(container))

        for index in down_indexes:
            self._start_container_server(cnodes[index])
        # Presumably gives the restarted servers time to come up before
        # they are queried again.
        sleep(2)
        self.assertTrue(container in self._account_container_names())
        return container, object1, cpart, cnodes

    def _assert_delete_container_503(self, container):
        """
        Assert that deleting ``container`` through the proxy fails with 503.

        :param container: name of the container to try to delete
        """
        exc = None
        try:
            client.delete_container(self.url, self.token, container)
        except client.ClientException as err:
            exc = err
        self.assertTrue(exc)
        # NOTE(review): this used to be ``assert_(exc.http_status, 503)``,
        # which passes 503 as the failure message and never compares the
        # status. If this now fails, the 503 expectation documented in the
        # calling tests needs revisiting.
        self.assertEqual(exc.http_status, 503)

    def _put_object2(self, container):
        """
        Upload ``object2`` into ``container`` and return its name.

        The account listing is deliberately not asserted before the upload:
        whether the container shows up there depends on the position of the
        account server associated with the container server that had been
        killed. If that happens to be the first account server the check
        would pass; otherwise the first account server serves the listing
        and does not have the container.

        :param container: name of the container to upload into
        :returns: the name of the uploaded object
        """
        object2 = 'object2'
        # This will work because at least one (in this case, just one)
        # account server has to indicate the container exists for the put
        # to continue.
        client.put_object(self.url, self.token, container, object2, 'test')
        return object2

    def _assert_final_state(self, container, object1, object2):
        """
        Bring the cluster to its final state and check the end result.

        :param container: name of the container used by the test
        :param object1: name of the object that was deleted
        :param object2: name of the object uploaded after the failure
        """
        get_to_final_state()
        # No container delete succeeded in any of these scenarios, so the
        # container must still be in the account listing.
        self.assertTrue(container in self._account_container_names())
        # And, so our object2 should still exist and object1's delete
        # should have finalized.
        object_names = self._container_object_names(container)
        self.assertTrue(object1 not in object_names)
        self.assertTrue(
            object2 in self._container_object_names(container))

    def test_first_node_fail(self):
        """Delete an object while the first container server is down."""
        container, object1, cpart, cnodes = \
            self._delete_object_with_nodes_down((0,))
        # This is okay because the first node hasn't got the update that
        # the object was deleted yet.
        self.assertTrue(object1 in self._direct_object_names(
            cnodes[0], cpart, container))

        object2 = self._put_object2(container)
        # First node still doesn't know object1 was deleted yet; this is
        # okay.
        self.assertTrue(object1 in self._direct_object_names(
            cnodes[0], cpart, container))
        # And, of course, our new object2 exists.
        self.assertTrue(
            object2 in self._container_object_names(container))

        self._assert_final_state(container, object1, object2)

    def test_second_node_fail(self):
        """Delete an object while the second container server is down."""
        container, object1, cpart, cnodes = \
            self._delete_object_with_nodes_down((1,))
        self.assertTrue(
            object1 not in self._container_object_names(container))

        object2 = self._put_object2(container)
        # The first node was up for the delete, so it no longer lists
        # object1.
        self.assertTrue(object1 not in self._direct_object_names(
            cnodes[0], cpart, container))
        # And, of course, our new object2 exists.
        self.assertTrue(
            object2 in self._container_object_names(container))

        self._assert_final_state(container, object1, object2)

    def test_first_two_nodes_fail(self):
        """Delete an object while the first two container servers are down."""
        container, object1, cpart, cnodes = \
            self._delete_object_with_nodes_down((0, 1))
        # This is okay because the first node hasn't got the update that
        # the object was deleted yet.
        self.assertTrue(object1 in self._direct_object_names(
            cnodes[0], cpart, container))

        # This fails because all three nodes have to indicate deletion
        # before we tell the user it worked. Since the first node 409s (it
        # hasn't got the update that the object was deleted yet), the whole
        # must 503 (until everything is synced up, then the delete would
        # work).
        self._assert_delete_container_503(container)

        object2 = self._put_object2(container)
        # First node still doesn't know object1 was deleted yet; this is
        # okay.
        self.assertTrue(object1 in self._direct_object_names(
            cnodes[0], cpart, container))
        # And, of course, our new object2 exists.
        self.assertTrue(
            object2 in self._container_object_names(container))

        self._assert_final_state(container, object1, object2)

    def test_last_two_nodes_fail(self):
        """Delete an object while the last two container servers are down."""
        container, object1, cpart, cnodes = \
            self._delete_object_with_nodes_down((1, 2))
        # The first node was up for the delete, so it no longer lists
        # object1.
        self.assertTrue(object1 not in self._direct_object_names(
            cnodes[0], cpart, container))

        # This fails because all three nodes have to indicate deletion
        # before we tell the user it worked. The two restarted nodes were
        # down for the object delete, so presumably they still list object1
        # and 409; the whole must 503 (until everything is synced up, then
        # the delete would work).
        self._assert_delete_container_503(container)

        object2 = self._put_object2(container)
        self.assertTrue(object1 not in self._direct_object_names(
            cnodes[0], cpart, container))
        # And, of course, our new object2 exists.
        self.assertTrue(
            object2 in self._container_object_names(container))

        self._assert_final_state(container, object1, object2)

    def _get_db_file_path(self, obj_dir):
        """
        Return the path of a database file inside ``obj_dir``.

        Directory entries are scanned in reverse-sorted name order and the
        first one whose name ends with ``db`` wins.

        :param obj_dir: directory to search
        :returns: joined path of the matching entry, or None if no entry
                  matches
        """
        for name in sorted(os.listdir(obj_dir), reverse=True):
            if name.endswith('db'):
                return os.path.join(obj_dir, name)
        return None

    def _get_container_db_files(self, container):
        """
        Return the database file path of ``container`` on each ring node.

        The devices root is read from each node's container server config,
        whose number is derived from the node's port as
        ``(port - 6000) // 10``.

        :param container: name of the container
        :returns: list with one entry per ring node, in ring order; an entry
                  is None when no database file was found for that node
        """
        cpart, cnodes = self.container_ring.get_nodes(self.account,
                                                      container)
        # The hash depends only on the account and container, not the node.
        hash_str = hash_path(self.account, container)
        db_files = []
        for cnode in cnodes:
            # Floor division keeps the number an integer regardless of the
            # interpreter's division semantics.
            node_id = (cnode['port'] - 6000) // 10
            server_conf = readconf('/etc/swift/container-server/%s.conf' %
                                   node_id)
            devices = server_conf['app:container-server']['devices']
            obj_dir = '%s/%s/containers/%s/%s/%s/' % (devices,
                                                      cnode['device'], cpart,
                                                      hash_str[-3:], hash_str)
            db_files.append(self._get_db_file_path(obj_dir))
        return db_files

    def test_locked_container_dbs(self):
        """
        Check that container deletes do not hang on locked databases.

        Three scenarios run concurrently in green threads, locking one, two
        and three of a fresh container's databases. The whole run must
        finish within 15 seconds.
        """

        def run_test(num_locks, catch_503):
            """
            Lock ``num_locks`` container databases, then delete the container.

            :param num_locks: how many of the container's database files to
                              lock with an exclusive transaction
            :param catch_503: if true, a ClientException from the delete is
                              tolerated provided its status is 503
            """
            container = 'container-%s' % uuid4()
            client.put_container(self.url, self.token, container)
            db_files = self._get_container_db_files(container)
            db_conns = []
            try:
                for i in range(num_locks):
                    db_conn = sqlite3.connect(db_files[i])
                    # Track the connection before locking so it is closed
                    # even if taking the lock fails.
                    db_conns.append(db_conn)
                    db_conn.execute('begin exclusive transaction')
                if catch_503:
                    # NOTE(review): a delete that succeeds is not treated
                    # as a failure here; only the status of a raised
                    # exception is checked.
                    try:
                        client.delete_container(self.url, self.token,
                                                container)
                    except client.ClientException as e:
                        self.assertEqual(e.http_status, 503)
                else:
                    client.delete_container(self.url, self.token, container)
            finally:
                # Release the exclusive locks deterministically instead of
                # relying on garbage collection of the connections.
                for db_conn in db_conns:
                    db_conn.close()

        pool = eventlet.GreenPool()
        try:
            with eventlet.Timeout(15):
                pool.spawn(run_test, 1, False)
                pool.spawn(run_test, 2, True)
                pool.spawn(run_test, 3, True)
                # NOTE(review): waitall() only waits; errors raised inside
                # the green threads are not re-raised here.
                pool.waitall()
        except eventlet.Timeout as e:
            raise Exception(
                "The server did not return a 503 on container db locks, "
                "it just hangs: %s" % e)
2010-07-12 17:03:45 -05:00
# Run the probe tests in this module when it is executed as a script.
if __name__ == '__main__':
    unittest.main()