More quarantine catching code. This should catch and quarantine any databases that indicate malformation or corruption with the known raised exceptions and it should catch and quarantine the case we've seen where hash directories become files.

This commit is contained in:
gholt 2011-08-04 16:52:26 +00:00 committed by Tarmac
commit 9ca33c8690
7 changed files with 150 additions and 33 deletions

View File

@ -22,6 +22,7 @@ import logging
import operator
import os
from uuid import uuid4
import sys
import time
import cPickle as pickle
import errno
@ -256,12 +257,46 @@ class DatabaseBroker(object):
self._delete_db(conn, timestamp)
conn.commit()
def possibly_quarantine(self, exc_type, exc_value, exc_traceback):
"""
Checks the exception info to see if it indicates a quarantine situation
(malformed or corrupted database). If not, the original exception will
be reraised. If so, the database will be quarantined and a new
sqlite3.DatabaseError will be raised indicating the action taken.
"""
if 'database disk image is malformed' in str(exc_value):
exc_hint = 'malformed'
elif 'file is encrypted or is not a database' in str(exc_value):
exc_hint = 'corrupted'
else:
raise exc_type, exc_value, exc_traceback
prefix_path = os.path.dirname(self.db_dir)
partition_path = os.path.dirname(prefix_path)
dbs_path = os.path.dirname(partition_path)
device_path = os.path.dirname(dbs_path)
quar_path = os.path.join(device_path, 'quarantined', self.db_type,
os.path.basename(self.db_dir))
try:
renamer(self.db_dir, quar_path)
except OSError, e:
if e.errno not in (errno.EEXIST, errno.ENOTEMPTY):
raise
quar_path = "%s-%s" % (quar_path, uuid4().hex)
renamer(self.db_dir, quar_path)
detail = _('Quarantined %s to %s due to %s database') % \
(self.db_dir, quar_path, exc_hint)
self.logger.error(detail)
raise sqlite3.DatabaseError(detail)
@contextmanager
def get(self):
"""Use with the "with" statement; returns a database connection."""
if not self.conn:
if self.db_file != ':memory:' and os.path.exists(self.db_file):
try:
self.conn = get_db_connection(self.db_file, self.timeout)
except (sqlite3.DatabaseError, DatabaseConnectionError):
self.possibly_quarantine(*sys.exc_info())
else:
raise DatabaseConnectionError(self.db_file, "DB doesn't exist")
conn = self.conn
@ -270,6 +305,12 @@ class DatabaseBroker(object):
yield conn
conn.rollback()
self.conn = conn
except sqlite3.DatabaseError, err:
try:
conn.close()
except:
pass
self.possibly_quarantine(*sys.exc_info())
except Exception:
conn.close()
raise

View File

@ -14,7 +14,7 @@
# limitations under the License.
import os
from os.path import isdir, join
from os.path import basename, dirname, isdir, join
import random
import shutil
import time
@ -22,6 +22,8 @@ import logging
import hashlib
import itertools
import cPickle as pickle
import errno
import uuid
import eventlet
from eventlet import GreenPool, tpool, Timeout, sleep, hubs
@ -30,7 +32,7 @@ from eventlet.support.greenlets import GreenletExit
from swift.common.ring import Ring
from swift.common.utils import whataremyips, unlink_older_than, lock_path, \
compute_eta, get_logger, write_pickle
compute_eta, get_logger, write_pickle, renamer
from swift.common.bufferedhttp import http_connect
from swift.common.daemon import Daemon
@ -41,6 +43,31 @@ ONE_WEEK = 604800
HASH_FILE = 'hashes.pkl'
def quarantine_renamer(device_path, corrupted_file_path):
"""
In the case that a file is corrupted, move it to a quarantined
area to allow replication to fix it.
:params device_path: The path to the device the corrupted file is on.
:params corrupted_file_path: The path to the file you want quarantined.
:returns: path (str) of directory the file was moved to
:raises OSError: re-raises non errno.EEXIST / errno.ENOTEMPTY
exceptions from rename
"""
from_dir = dirname(corrupted_file_path)
to_dir = join(device_path, 'quarantined', 'objects', basename(from_dir))
invalidate_hash(dirname(from_dir))
try:
renamer(from_dir, to_dir)
except OSError, e:
if e.errno not in (errno.EEXIST, errno.ENOTEMPTY):
raise
to_dir = "%s-%s" % (to_dir, uuid.uuid4().hex)
renamer(from_dir, to_dir)
return to_dir
def hash_suffix(path, reclaim_age):
"""
Performs reclamation and returns an md5 of all (remaining) files.
@ -50,7 +77,19 @@ def hash_suffix(path, reclaim_age):
md5 = hashlib.md5()
for hsh in sorted(os.listdir(path)):
hsh_path = join(path, hsh)
try:
files = os.listdir(hsh_path)
except OSError, err:
if err.errno == errno.ENOTDIR:
partition_path = dirname(path)
objects_path = dirname(partition_path)
device_path = dirname(objects_path)
quar_path = quarantine_renamer(device_path, hsh_path)
logging.exception(
_('Quarantined %s to %s because it is not a directory') %
(hsh_path, quar_path))
continue
raise
if len(files) == 1:
if files[0].endswith('.ts'):
# remove tombstones older than reclaim_age

View File

@ -21,7 +21,6 @@ import errno
import os
import time
import traceback
import uuid
from datetime import datetime
from hashlib import md5
from tempfile import mkstemp
@ -44,7 +43,8 @@ from swift.common.constraints import check_object_creation, check_mount, \
check_float, check_utf8
from swift.common.exceptions import ConnectionTimeout, DiskFileError, \
DiskFileNotExist
from swift.obj.replicator import tpooled_get_hashes, invalidate_hash
from swift.obj.replicator import tpooled_get_hashes, invalidate_hash, \
quarantine_renamer
DATADIR = 'objects'
@ -91,32 +91,6 @@ def write_metadata(fd, metadata):
key += 1
def quarantine_renamer(device_path, corrupted_file_path):
"""
In the case that a file is corrupted, move it to a quarantined
area to allow replication to fix it.
:params device_path: The path to the device the corrupted file is on.
:params corrupted_file_path: The path to the file you want quarantined.
:returns: path (str) of directory the file was moved to
:raises OSError: re-raises non errno.EEXIST / errno.ENOTEMPTY
exceptions from rename
"""
from_dir = os.path.dirname(corrupted_file_path)
to_dir = os.path.join(device_path, 'quarantined',
'objects', os.path.basename(from_dir))
invalidate_hash(os.path.dirname(from_dir))
try:
renamer(from_dir, to_dir)
except OSError, e:
if e.errno not in (errno.EEXIST, errno.ENOTEMPTY):
raise
to_dir = "%s-%s" % (to_dir, uuid.uuid4().hex)
renamer(from_dir, to_dir)
return to_dir
class DiskFile(object):
"""
Manage object files on disk.

Binary file not shown.

Binary file not shown.

View File

@ -19,7 +19,7 @@ from __future__ import with_statement
import hashlib
import os
import unittest
from shutil import rmtree
from shutil import rmtree, copy
from StringIO import StringIO
from time import sleep, time
from uuid import uuid4
@ -27,6 +27,7 @@ from uuid import uuid4
import simplejson
import sqlite3
import swift.common.db
from swift.common.db import AccountBroker, chexor, ContainerBroker, \
DatabaseBroker, DatabaseConnectionError, dict_factory, get_db_connection
from swift.common.utils import normalize_timestamp
@ -199,6 +200,47 @@ class TestDatabaseBroker(unittest.TestCase):
with broker.get() as conn:
self.assertEquals(
[r[0] for r in conn.execute('SELECT * FROM test')], ['1'])
orig_renamer = swift.common.db.renamer
try:
swift.common.db.renamer = lambda a, b: b
qpath = os.path.dirname(os.path.dirname(os.path.dirname(
os.path.dirname(self.testdir))))
if qpath:
qpath += '/quarantined/test/db'
else:
qpath = 'quarantined/test/db'
# Test malformed database
copy(os.path.join(os.path.dirname(__file__),
'malformed_example.db'),
os.path.join(self.testdir, '1.db'))
broker = DatabaseBroker(os.path.join(self.testdir, '1.db'))
broker.db_type = 'test'
exc = None
try:
with broker.get() as conn:
conn.execute('SELECT * FROM test')
except Exception, err:
exc = err
self.assertEquals(str(exc),
'Quarantined %s to %s due to malformed database' %
(self.testdir, qpath))
# Test corrupted database
copy(os.path.join(os.path.dirname(__file__),
'corrupted_example.db'),
os.path.join(self.testdir, '1.db'))
broker = DatabaseBroker(os.path.join(self.testdir, '1.db'))
broker.db_type = 'test'
exc = None
try:
with broker.get() as conn:
conn.execute('SELECT * FROM test')
except Exception, err:
exc = err
self.assertEquals(str(exc),
'Quarantined %s to %s due to corrupted database' %
(self.testdir, qpath))
finally:
swift.common.db.renamer = orig_renamer
def test_lock(self):
broker = DatabaseBroker(os.path.join(self.testdir, '1.db'), timeout=.1)

View File

@ -205,6 +205,27 @@ class TestObjectReplicator(unittest.TestCase):
self.assertEquals(hashed, 1)
self.assert_('a83' in hashes)
def test_hash_suffix_hash_dir_is_file_quarantine(self):
df = DiskFile(self.devices, 'sda', '0', 'a', 'c', 'o', FakeLogger())
mkdirs(os.path.dirname(df.datadir))
open(df.datadir, 'wb').close()
ohash = hash_path('a', 'c', 'o')
data_dir = ohash[-3:]
whole_path_from = os.path.join(self.objects, '0', data_dir)
orig_quarantine_renamer = object_replicator.quarantine_renamer
called = [False]
def wrapped(*args, **kwargs):
called[0] = True
return orig_quarantine_renamer(*args, **kwargs)
try:
object_replicator.quarantine_renamer = wrapped
object_replicator.hash_suffix(whole_path_from, 101)
finally:
object_replicator.quarantine_renamer = orig_quarantine_renamer
self.assertTrue(called[0])
def test_hash_suffix_one_file(self):
df = DiskFile(self.devices, 'sda', '0', 'a', 'c', 'o', FakeLogger())
mkdirs(df.datadir)