Added exception catchalls for auditors.

Also, changed some os.listdir calls to a new
swift.common.utils.listdir call which returns an empty list for
non-existent paths.

Change-Id: I89964636899d39bc07b1ecf2688786ffca67bf17
This commit is contained in:
gholt 2011-12-15 08:06:39 +00:00
parent dfb9a9f0a3
commit 2eb0eb510b
4 changed files with 56 additions and 40 deletions

View File

@ -44,9 +44,9 @@ class AccountAuditor(Daemon):
time.sleep(random() * self.interval) time.sleep(random() * self.interval)
while True: while True:
begin = time.time() begin = time.time()
try:
all_locs = audit_location_generator(self.devices, all_locs = audit_location_generator(self.devices,
account_server.DATADIR, account_server.DATADIR, mount_check=self.mount_check,
mount_check=self.mount_check,
logger=self.logger) logger=self.logger)
for path, device, partition in all_locs: for path, device, partition in all_locs:
self.account_audit(path) self.account_audit(path)
@ -59,6 +59,8 @@ class AccountAuditor(Daemon):
reported = time.time() reported = time.time()
self.account_passes = 0 self.account_passes = 0
self.account_failures = 0 self.account_failures = 0
except (Exception, Timeout):
self.logger.exception(_('ERROR auditing'))
elapsed = time.time() - begin elapsed = time.time() - begin
if elapsed < self.interval: if elapsed < self.interval:
time.sleep(self.interval - elapsed) time.sleep(self.interval - elapsed)

View File

@ -731,11 +731,11 @@ def iter_devices_partitions(devices_dir, item_type):
:param item_type: One of 'accounts', 'containers', or 'objects' :param item_type: One of 'accounts', 'containers', or 'objects'
:returns: Each iteration returns a tuple of (device, partition) :returns: Each iteration returns a tuple of (device, partition)
""" """
devices = os.listdir(devices_dir) devices = listdir(devices_dir)
shuffle(devices) shuffle(devices)
devices_partitions = [] devices_partitions = []
for device in devices: for device in devices:
partitions = os.listdir(os.path.join(devices_dir, device, item_type)) partitions = listdir(os.path.join(devices_dir, device, item_type))
shuffle(partitions) shuffle(partitions)
devices_partitions.append((device, iter(partitions))) devices_partitions.append((device, iter(partitions)))
yielded = True yielded = True
@ -757,7 +757,7 @@ def unlink_older_than(path, mtime):
:mtime: timestamp of oldest file to keep :mtime: timestamp of oldest file to keep
""" """
if os.path.exists(path): if os.path.exists(path):
for fname in os.listdir(path): for fname in listdir(path):
fpath = os.path.join(path, fname) fpath = os.path.join(path, fname)
try: try:
if os.path.getmtime(fpath) < mtime: if os.path.getmtime(fpath) < mtime:
@ -929,7 +929,7 @@ def audit_location_generator(devices, datadir, mount_check=True, logger=None):
on devices on devices
:param logger: a logger object :param logger: a logger object
''' '''
device_dir = os.listdir(devices) device_dir = listdir(devices)
# randomize devices in case of process restart before sweep completed # randomize devices in case of process restart before sweep completed
shuffle(device_dir) shuffle(device_dir)
for device in device_dir: for device in device_dir:
@ -942,22 +942,22 @@ def audit_location_generator(devices, datadir, mount_check=True, logger=None):
datadir_path = os.path.join(devices, device, datadir) datadir_path = os.path.join(devices, device, datadir)
if not os.path.exists(datadir_path): if not os.path.exists(datadir_path):
continue continue
partitions = os.listdir(datadir_path) partitions = listdir(datadir_path)
for partition in partitions: for partition in partitions:
part_path = os.path.join(datadir_path, partition) part_path = os.path.join(datadir_path, partition)
if not os.path.isdir(part_path): if not os.path.isdir(part_path):
continue continue
suffixes = os.listdir(part_path) suffixes = listdir(part_path)
for suffix in suffixes: for suffix in suffixes:
suff_path = os.path.join(part_path, suffix) suff_path = os.path.join(part_path, suffix)
if not os.path.isdir(suff_path): if not os.path.isdir(suff_path):
continue continue
hashes = os.listdir(suff_path) hashes = listdir(suff_path)
for hsh in hashes: for hsh in hashes:
hash_path = os.path.join(suff_path, hsh) hash_path = os.path.join(suff_path, hsh)
if not os.path.isdir(hash_path): if not os.path.isdir(hash_path):
continue continue
for fname in sorted(os.listdir(hash_path), for fname in sorted(listdir(hash_path),
reverse=True): reverse=True):
path = os.path.join(hash_path, fname) path = os.path.join(hash_path, fname)
yield path, device, partition yield path, device, partition
@ -1106,3 +1106,12 @@ def dump_recon_cache(cache_key, cache_value, cache_file, lock_timeout=2):
except OSError, err: except OSError, err:
if err.errno != errno.ENOENT: if err.errno != errno.ENOENT:
raise raise
def listdir(path):
    """
    List the entries of a directory, treating a missing path as empty.

    Thin wrapper around os.listdir for callers that walk directory trees
    which may disappear (or never have existed) between checks.

    :param path: filesystem path of the directory to list
    :returns: the list of entry names from os.listdir, or an empty list if
              the path does not exist
    :raises OSError: for any failure other than errno.ENOENT (for example
                     the path is not a directory, or access is denied)
    """
    try:
        return os.listdir(path)
    except OSError as err:
        # Only "no such file or directory" is swallowed; every other error
        # is propagated unchanged so real problems are not hidden.
        if err.errno != errno.ENOENT:
            raise
    return []

View File

@ -45,22 +45,24 @@ class ContainerAuditor(Daemon):
time.sleep(random() * self.interval) time.sleep(random() * self.interval)
while True: while True:
begin = time.time() begin = time.time()
try:
all_locs = audit_location_generator(self.devices, all_locs = audit_location_generator(self.devices,
container_server.DATADIR, container_server.DATADIR, mount_check=self.mount_check,
mount_check=self.mount_check,
logger=self.logger) logger=self.logger)
for path, device, partition in all_locs: for path, device, partition in all_locs:
self.container_audit(path) self.container_audit(path)
if time.time() - reported >= 3600: # once an hour if time.time() - reported >= 3600: # once an hour
self.logger.info( self.logger.info(
_('Since %(time)s: Container audits: %(pass)s passed ' _('Since %(time)s: Container audits: %(pass)s '
'audit, %(fail)s failed audit'), 'passed audit, %(fail)s failed audit'),
{'time': time.ctime(reported), {'time': time.ctime(reported),
'pass': self.container_passes, 'pass': self.container_passes,
'fail': self.container_failures}) 'fail': self.container_failures})
reported = time.time() reported = time.time()
self.container_passes = 0 self.container_passes = 0
self.container_failures = 0 self.container_failures = 0
except (Exception, Timeout):
self.logger.exception(_('ERROR auditing'))
elapsed = time.time() - begin elapsed = time.time() - begin
if elapsed < self.interval: if elapsed < self.interval:
time.sleep(self.interval - elapsed) time.sleep(self.interval - elapsed)

View File

@ -204,7 +204,10 @@ class ObjectAuditor(Daemon):
if parent: if parent:
kwargs['zero_byte_fps'] = zbo_fps or self.conf_zero_byte_fps kwargs['zero_byte_fps'] = zbo_fps or self.conf_zero_byte_fps
while True: while True:
try:
self.run_once(**kwargs) self.run_once(**kwargs)
except (Exception, Timeout):
self.logger.exception(_('ERROR auditing'))
self._sleep() self._sleep()
def run_once(self, *args, **kwargs): def run_once(self, *args, **kwargs):