swift/test/probe/test_dark_data.py

#!/usr/bin/python -u
# Copyright (c) 2010-2012 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import unittest

import os
import uuid
import shutil

from datetime import datetime
from six.moves.configparser import ConfigParser

from test.probe.brain import BrainSplitter
from test.probe.common import ReplProbeTest
from swift.common import manager
from swift.common.storage_policy import get_policy_string
from swift.common.manager import Manager, Server
from swift.common.utils import readconf


# Name of the config section carrying the options for the swift#dark_data
# audit watcher; setUp() below adds it to every auditor config it generates.
CONF_SECTION = 'object-auditor:watcher:swift#dark_data'


class TestDarkDataDeletion(ReplProbeTest):
    """
    Probe test for the ``swift#dark_data`` audit watcher with action 'delete'.

    The test manufactures "dark data" -- object files that exist on disk
    although every container listing is empty -- then runs the object
    auditor with the watcher enabled and verifies what became of the files.

    Subclasses can cover another watcher action by overriding ``action`` and
    ``check_on_disk_files``.
    """
    # NB: could be 'quarantine' in another test
    action = 'delete'

    def setUp(self):
        """
        Run the base-class setup, then stage the fixtures for this test.

        A private config tree is written under ``self.conf_dest`` containing,
        for each existing auditor config, a copy of its ``object-auditor``
        section with the dark_data watcher enabled plus the watcher's own
        section. A BrainSplitter for a uniquely named container/object is
        stored on ``self.brain``.
        """
        super(TestDarkDataDeletion, self).setUp()

        self.conf_dest = \
            os.path.join('/tmp/',
                         datetime.now().strftime('swift-%Y-%m-%d_%H-%M-%S-%f'))
        os.mkdir(self.conf_dest)

        object_server_dir = os.path.join(self.conf_dest, 'object-server')
        os.mkdir(object_server_dir)

        for conf_file in Server('object-auditor').conf_files():
            config = readconf(conf_file)
            if 'object-auditor' not in config:
                continue  # *somebody* should be set up to run the auditor
            config['object-auditor'].update(
                {'watchers': 'swift#dark_data'})
            # Note that this setdefault business may mean the watcher doesn't
            # pick up DEFAULT values, but that (probably?) won't matter.
            # We set grace_age to 0 so that tests don't have to deal with time.
            config.setdefault(CONF_SECTION, {}).update(
                {'action': self.action,
                 'grace_age': "0"})

            # Only the two sections the auditor needs are copied over.
            parser = ConfigParser()
            for section in ('object-auditor', CONF_SECTION):
                parser.add_section(section)
                for option, value in config[section].items():
                    parser.set(section, option, value)

            file_name = os.path.basename(conf_file)
            if file_name.endswith('.d'):
                # Work around conf.d setups (like you might see with VSAIO)
                file_name = file_name[:-2]
            with open(os.path.join(object_server_dir, file_name), 'w') as fp:
                parser.write(fp)

        self.container_name = 'container-%s' % uuid.uuid4()
        self.object_name = 'object-%s' % uuid.uuid4()
        self.brain = BrainSplitter(self.url, self.token, self.container_name,
                                   self.object_name, 'object',
                                   policy=self.policy)

    def tearDown(self):
        """
        Remove the temporary config tree, then run the base-class teardown.
        """
        try:
            shutil.rmtree(self.conf_dest)
        finally:
            # Always chain up, even when the rmtree fails, so that whatever
            # cleanup the parent classes perform is never skipped.
            super(TestDarkDataDeletion, self).tearDown()

    def gather_object_files_by_ext(self):
        """
        Collect the object files found on the brain's nodes.

        :returns: a defaultdict mapping a file extension (e.g. ``'.data'``)
                  to the set of full paths having that extension; lock and
                  hashes bookkeeping files are left out.
        """
        ignored_names = ('.lock', 'hashes.pkl', 'hashes.invalid',
                         '.lock-replication')
        policy_dir = get_policy_string('objects', self.policy)
        result = collections.defaultdict(set)
        for node in self.brain.nodes:
            objects_root = os.path.join(self.device_dir(node), policy_dir)
            for path, _, file_names in os.walk(objects_root):
                for file_name in file_names:
                    if file_name in ignored_names:
                        continue
                    _, ext = os.path.splitext(file_name)
                    result[ext].add(os.path.join(path, file_name))
        return result

    def test_dark_data(self):
        """
        Create dark data via a partial delete and check the watcher acts.
        """
        self.brain.put_container()
        self.brain.put_object()
        self.brain.stop_handoff_half()
        self.brain.delete_object()
        Manager(['object-updater']).once()
        Manager(['container-replicator']).once()

        # Sanity check:
        # * all containers are empty
        # * primaries that are still up have two .ts files
        # * primary that's down has one .data file
        for headers, items in self.direct_get_container(
                container=self.container_name).values():
            self.assertEqual(headers['X-Container-Object-Count'], '0')
            self.assertEqual(items, [])

        files = self.gather_object_files_by_ext()
        self.assertLengthEqual(files, 2)
        self.assertLengthEqual(files['.ts'], 2)
        self.assertLengthEqual(files['.data'], 1)

        # Simulate a reclaim_age passing,
        # so the tombstones all got cleaned up
        for file_path in files['.ts']:
            os.unlink(file_path)

        # Old node gets reintroduced to the cluster
        self.brain.start_handoff_half()
        # ...so replication thinks it's got some work to do
        Manager(['object-replicator']).once()

        # Now we're back to *three* .data files
        files = self.gather_object_files_by_ext()
        self.assertLengthEqual(files, 1)
        self.assertLengthEqual(files['.data'], 3)

        # But that's OK, audit watchers to the rescue!
        # Point the manager at our generated configs only for the duration
        # of the auditor run, and always restore the original directory.
        old_swift_dir = manager.SWIFT_DIR
        manager.SWIFT_DIR = self.conf_dest
        try:
            Manager(['object-auditor']).once()
        finally:
            manager.SWIFT_DIR = old_swift_dir

        # Verify that the policy was applied.
        self.check_on_disk_files(files['.data'])

    def check_on_disk_files(self, files):
        """
        Assert that every path in ``files`` was deleted outright.

        :param files: iterable of paths of the former dark .data files
        """
        for file_path in files:
            # File's not there
            self.assertPathDoesNotExist(file_path)
            # And it's not quarantined, either!
            self.assertPathDoesNotExist(os.path.join(
                file_path[:file_path.index('objects')], 'quarantined'))

    def assertPathExists(self, path):
        """Fail with a descriptive message unless ``path`` exists."""
        msg = "Expected path %r to exist, but it doesn't" % path
        self.assertTrue(os.path.exists(path), msg)

    def assertPathDoesNotExist(self, path):
        """Fail with a descriptive message if ``path`` exists."""
        msg = "Expected path %r to not exist, but it does" % path
        self.assertFalse(os.path.exists(path), msg)


class TestDarkDataQuarantining(TestDarkDataDeletion):
    """
    Same scenario as the parent class, with the watcher action set to
    'quarantine' and the on-disk expectations adjusted to match.
    """
    action = 'quarantine'

    def check_on_disk_files(self, files):
        """
        Assert that every path in ``files`` was moved into quarantine.

        :param files: iterable of paths of the former dark .data files
        """
        sep = os.path.sep
        for data_path in files:
            # The file must be gone from its original location...
            self.assertPathDoesNotExist(data_path)
            # ...and show up under 'quarantined', which sits where the
            # policy directory was, keeping the last two path components.
            components = data_path.split(sep)
            policy_dir = get_policy_string('objects', self.policy)
            cut = components.index(policy_dir)
            expected = components[:cut]
            expected.append('quarantined')
            expected.append(policy_dir)
            expected.extend(components[-2:])
            self.assertPathExists(sep.join(expected))


# Let the module be executed directly as a script to run its tests.
if __name__ == "__main__":
    unittest.main()
Let developers/operators add watchers to object audit Swift operators may find it useful to operate on each object in their cluster in some way. This commit provides them a way to hook into the object auditor with a simple, clearly-defined boundary so that they can iterate over their objects without additional disk IO. For example, a cluster operator may want to ensure a semantic consistency with all SLO segments accounted in their manifests, or locate objects that aren't in container listings. Now that Swift has encryption support, this could be used to locate unencrypted objects. The list goes on. This commit makes the auditor locate, via entry points, the watchers named in its config file. A watcher is a class with at least these four methods: __init__(self, conf, logger, kwargs) start(self, audit_type, kwargs) see_object(self, object_metadata, data_file_path, kwargs) end(self, kwargs) The auditor will call watcher.start(audit_type) at the start of an audit pass, watcher.see_object(...) for each object audited, and watcher.end() at the end of an audit pass. All method arguments are passed as keyword args. This version of the API is implemented on the context of the auditor itself, without spawning any additional processes. If the plugins are not working well -- hang, crash, or leak -- it's easier to debug them when there's no additional complication of processes that run by themselves. In addition, we include a reference implementation of plugin for the watcher API, as a help to plugin writers. Change-Id: I1be1faec53b2cdfaabf927598f1460e23c206b0a 2015-08-13 17:05:25 -05:00			`#!/usr/bin/python -u`
			`# Copyright (c) 2010-2012 OpenStack Foundation`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or`
			`# implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import collections`
			`import unittest`

			`import os`
			`import uuid`
			`import shutil`

			`from datetime import datetime`
			`from six.moves.configparser import ConfigParser`

			`from test.probe.brain import BrainSplitter`
			`from test.probe.common import ReplProbeTest`
			`from swift.common import manager`
			`from swift.common.storage_policy import get_policy_string`
			`from swift.common.manager import Manager, Server`
			`from swift.common.utils import readconf`


			`CONF_SECTION = 'object-auditor:watcher:swift#dark_data'`


			`class TestDarkDataDeletion(ReplProbeTest):`
			`# NB: could be 'quarantine' in another test`
			`action = 'delete'`

			`def setUp(self):`
			`"""`
			`Reset all environment and start all servers.`
			`"""`
			`super(TestDarkDataDeletion, self).setUp()`

			`self.conf_dest = \`
			`os.path.join('/tmp/',`
			`datetime.now().strftime('swift-%Y-%m-%d_%H-%M-%S-%f'))`
			`os.mkdir(self.conf_dest)`

			`object_server_dir = os.path.join(self.conf_dest, 'object-server')`
			`os.mkdir(object_server_dir)`

			`for conf_file in Server('object-auditor').conf_files():`
			`config = readconf(conf_file)`
			`if 'object-auditor' not in config:`
			`continue # somebody should be set up to run the auditor`
			`config['object-auditor'].update(`
			`{'watchers': 'swift#dark_data'})`
			`# Note that this setdefault business may mean the watcher doesn't`
Make dark data watcher ignore the newly updated objects When objects are freshly uploaded, they may take a little time to appear in container listings, producing false positives. Because we needed to test this, we also reworked/added the tests and fixed some issues, including adding an EC fragment (thanks to Alistair's code). Closes-Bug: 1925782 Change-Id: Ieafa72a496328f7a487ca7062da6253994a5a07d Co-Authored-By: Alistair Coles <alistairncoles@gmail.com> 2021-04-27 22:15:56 -05:00			`# pick up DEFAULT values, but that (probably?) won't matter.`
			`# We set grace_age to 0 so that tests don't have to deal with time.`
Let developers/operators add watchers to object audit Swift operators may find it useful to operate on each object in their cluster in some way. This commit provides them a way to hook into the object auditor with a simple, clearly-defined boundary so that they can iterate over their objects without additional disk IO. For example, a cluster operator may want to ensure a semantic consistency with all SLO segments accounted in their manifests, or locate objects that aren't in container listings. Now that Swift has encryption support, this could be used to locate unencrypted objects. The list goes on. This commit makes the auditor locate, via entry points, the watchers named in its config file. A watcher is a class with at least these four methods: __init__(self, conf, logger, kwargs) start(self, audit_type, kwargs) see_object(self, object_metadata, data_file_path, kwargs) end(self, kwargs) The auditor will call watcher.start(audit_type) at the start of an audit pass, watcher.see_object(...) for each object audited, and watcher.end() at the end of an audit pass. All method arguments are passed as keyword args. This version of the API is implemented on the context of the auditor itself, without spawning any additional processes. If the plugins are not working well -- hang, crash, or leak -- it's easier to debug them when there's no additional complication of processes that run by themselves. In addition, we include a reference implementation of plugin for the watcher API, as a help to plugin writers. Change-Id: I1be1faec53b2cdfaabf927598f1460e23c206b0a 2015-08-13 17:05:25 -05:00			`config.setdefault(CONF_SECTION, {}).update(`
Make dark data watcher ignore the newly updated objects When objects are freshly uploaded, they may take a little time to appear in container listings, producing false positives. Because we needed to test this, we also reworked/added the tests and fixed some issues, including adding an EC fragment (thanks to Alistair's code). Closes-Bug: 1925782 Change-Id: Ieafa72a496328f7a487ca7062da6253994a5a07d Co-Authored-By: Alistair Coles <alistairncoles@gmail.com> 2021-04-27 22:15:56 -05:00			`{'action': self.action,`
			`'grace_age': "0"})`
Let developers/operators add watchers to object audit Swift operators may find it useful to operate on each object in their cluster in some way. This commit provides them a way to hook into the object auditor with a simple, clearly-defined boundary so that they can iterate over their objects without additional disk IO. For example, a cluster operator may want to ensure a semantic consistency with all SLO segments accounted in their manifests, or locate objects that aren't in container listings. Now that Swift has encryption support, this could be used to locate unencrypted objects. The list goes on. This commit makes the auditor locate, via entry points, the watchers named in its config file. A watcher is a class with at least these four methods: __init__(self, conf, logger, kwargs) start(self, audit_type, kwargs) see_object(self, object_metadata, data_file_path, kwargs) end(self, kwargs) The auditor will call watcher.start(audit_type) at the start of an audit pass, watcher.see_object(...) for each object audited, and watcher.end() at the end of an audit pass. All method arguments are passed as keyword args. This version of the API is implemented on the context of the auditor itself, without spawning any additional processes. If the plugins are not working well -- hang, crash, or leak -- it's easier to debug them when there's no additional complication of processes that run by themselves. In addition, we include a reference implementation of plugin for the watcher API, as a help to plugin writers. Change-Id: I1be1faec53b2cdfaabf927598f1460e23c206b0a 2015-08-13 17:05:25 -05:00
			`parser = ConfigParser()`
			`for section in ('object-auditor', CONF_SECTION):`
			`parser.add_section(section)`
			`for option, value in config[section].items():`
			`parser.set(section, option, value)`

			`file_name = os.path.basename(conf_file)`
			`if file_name.endswith('.d'):`
			`# Work around conf.d setups (like you might see with VSAIO)`
			`file_name = file_name[:-2]`
			`with open(os.path.join(object_server_dir, file_name), 'w') as fp:`
			`parser.write(fp)`

			`self.container_name = 'container-%s' % uuid.uuid4()`
			`self.object_name = 'object-%s' % uuid.uuid4()`
			`self.brain = BrainSplitter(self.url, self.token, self.container_name,`
			`self.object_name, 'object',`
			`policy=self.policy)`

			`def tearDown(self):`
			`shutil.rmtree(self.conf_dest)`

			`def gather_object_files_by_ext(self):`
			`result = collections.defaultdict(set)`
			`for node in self.brain.nodes:`
			`for path, _, files in os.walk(os.path.join(`
			`self.device_dir(node),`
			`get_policy_string('objects', self.policy))):`
			`for file in files:`
tests: Make dark data probe tests pass with sync_method = ssync Change-Id: Ic94761e435d85a7fe4bd17a7d341b1655b98b3ff 2023-05-17 12:00:49 -07:00			`if file in ('.lock', 'hashes.pkl', 'hashes.invalid',`
			`'.lock-replication'):`
Let developers/operators add watchers to object audit Swift operators may find it useful to operate on each object in their cluster in some way. This commit provides them a way to hook into the object auditor with a simple, clearly-defined boundary so that they can iterate over their objects without additional disk IO. For example, a cluster operator may want to ensure a semantic consistency with all SLO segments accounted in their manifests, or locate objects that aren't in container listings. Now that Swift has encryption support, this could be used to locate unencrypted objects. The list goes on. This commit makes the auditor locate, via entry points, the watchers named in its config file. A watcher is a class with at least these four methods: __init__(self, conf, logger, kwargs) start(self, audit_type, kwargs) see_object(self, object_metadata, data_file_path, kwargs) end(self, kwargs) The auditor will call watcher.start(audit_type) at the start of an audit pass, watcher.see_object(...) for each object audited, and watcher.end() at the end of an audit pass. All method arguments are passed as keyword args. This version of the API is implemented on the context of the auditor itself, without spawning any additional processes. If the plugins are not working well -- hang, crash, or leak -- it's easier to debug them when there's no additional complication of processes that run by themselves. In addition, we include a reference implementation of plugin for the watcher API, as a help to plugin writers. Change-Id: I1be1faec53b2cdfaabf927598f1460e23c206b0a 2015-08-13 17:05:25 -05:00			`continue`
			`_, ext = os.path.splitext(file)`
			`result[ext].add(os.path.join(path, file))`
			`return result`

			`def test_dark_data(self):`
			`self.brain.put_container()`
			`self.brain.put_object()`
			`self.brain.stop_handoff_half()`
			`self.brain.delete_object()`
			`Manager(['object-updater']).once()`
			`Manager(['container-replicator']).once()`

			`# Sanity check:`
			`# * all containers are empty`
			`# * primaries that are still up have two .ts files`
			`# * primary that's down has one .data file`
			`for index, (headers, items) in self.direct_get_container(`
			`container=self.container_name).items():`
			`self.assertEqual(headers['X-Container-Object-Count'], '0')`
			`self.assertEqual(items, [])`

			`files = self.gather_object_files_by_ext()`
			`self.assertLengthEqual(files, 2)`
			`self.assertLengthEqual(files['.ts'], 2)`
			`self.assertLengthEqual(files['.data'], 1)`

			`# Simulate a reclaim_age passing,`
			`# so the tombstones all got cleaned up`
			`for file_path in files['.ts']:`
			`os.unlink(file_path)`

			`# Old node gets reintroduced to the cluster`
			`self.brain.start_handoff_half()`
			`# ...so replication thinks its got some work to do`
			`Manager(['object-replicator']).once()`

			`# Now we're back to three .data files`
			`files = self.gather_object_files_by_ext()`
			`self.assertLengthEqual(files, 1)`
			`self.assertLengthEqual(files['.data'], 3)`

			`# But that's OK, audit watchers to the rescue!`
			`old_swift_dir = manager.SWIFT_DIR`
			`manager.SWIFT_DIR = self.conf_dest`
			`try:`
			`Manager(['object-auditor']).once()`
			`finally:`
			`manager.SWIFT_DIR = old_swift_dir`

			`# Verify that the policy was applied.`
			`self.check_on_disk_files(files['.data'])`

			`def check_on_disk_files(self, files):`
			`for file_path in files:`
			`# File's not there`
			`self.assertFalse(os.path.exists(file_path))`
			`# And it's not quaratined, either!`
			`self.assertPathDoesNotExist(os.path.join(`
			`file_path[:file_path.index('objects')], 'quarantined'))`

			`def assertPathExists(self, path):`
			`msg = "Expected path %r to exist, but it doesn't" % path`
			`self.assertTrue(os.path.exists(path), msg)`

			`def assertPathDoesNotExist(self, path):`
			`msg = "Expected path %r to not exist, but it does" % path`
			`self.assertFalse(os.path.exists(path), msg)`


			`class TestDarkDataQuarantining(TestDarkDataDeletion):`
			`action = 'quarantine'`

			`def check_on_disk_files(self, files):`
			`for file_path in files:`
			`# File's not there`
			`self.assertPathDoesNotExist(file_path)`
			`# Got quarantined`
			`parts = file_path.split(os.path.sep)`
Get TestDarkDataQuarantining passing when policy-0 is erasure-coded Change-Id: I6459eb69e81fddf99249e650f4778ccf7a4f1169 2021-04-22 16:59:19 -07:00			`policy_dir = get_policy_string('objects', self.policy)`
			`quarantine_dir = parts[:parts.index(policy_dir)] + ['quarantined']`
Let developers/operators add watchers to object audit Swift operators may find it useful to operate on each object in their cluster in some way. This commit provides them a way to hook into the object auditor with a simple, clearly-defined boundary so that they can iterate over their objects without additional disk IO. For example, a cluster operator may want to ensure a semantic consistency with all SLO segments accounted in their manifests, or locate objects that aren't in container listings. Now that Swift has encryption support, this could be used to locate unencrypted objects. The list goes on. This commit makes the auditor locate, via entry points, the watchers named in its config file. A watcher is a class with at least these four methods: __init__(self, conf, logger, kwargs) start(self, audit_type, kwargs) see_object(self, object_metadata, data_file_path, kwargs) end(self, kwargs) The auditor will call watcher.start(audit_type) at the start of an audit pass, watcher.see_object(...) for each object audited, and watcher.end() at the end of an audit pass. All method arguments are passed as keyword args. This version of the API is implemented on the context of the auditor itself, without spawning any additional processes. If the plugins are not working well -- hang, crash, or leak -- it's easier to debug them when there's no additional complication of processes that run by themselves. In addition, we include a reference implementation of plugin for the watcher API, as a help to plugin writers. Change-Id: I1be1faec53b2cdfaabf927598f1460e23c206b0a 2015-08-13 17:05:25 -05:00			`quarantine_path = os.path.sep.join(`
Get TestDarkDataQuarantining passing when policy-0 is erasure-coded Change-Id: I6459eb69e81fddf99249e650f4778ccf7a4f1169 2021-04-22 16:59:19 -07:00			`quarantine_dir + [policy_dir] + parts[-2:])`
Let developers/operators add watchers to object audit Swift operators may find it useful to operate on each object in their cluster in some way. This commit provides them a way to hook into the object auditor with a simple, clearly-defined boundary so that they can iterate over their objects without additional disk IO. For example, a cluster operator may want to ensure a semantic consistency with all SLO segments accounted in their manifests, or locate objects that aren't in container listings. Now that Swift has encryption support, this could be used to locate unencrypted objects. The list goes on. This commit makes the auditor locate, via entry points, the watchers named in its config file. A watcher is a class with at least these four methods: __init__(self, conf, logger, kwargs) start(self, audit_type, kwargs) see_object(self, object_metadata, data_file_path, kwargs) end(self, kwargs) The auditor will call watcher.start(audit_type) at the start of an audit pass, watcher.see_object(...) for each object audited, and watcher.end() at the end of an audit pass. All method arguments are passed as keyword args. This version of the API is implemented on the context of the auditor itself, without spawning any additional processes. If the plugins are not working well -- hang, crash, or leak -- it's easier to debug them when there's no additional complication of processes that run by themselves. In addition, we include a reference implementation of plugin for the watcher API, as a help to plugin writers. Change-Id: I1be1faec53b2cdfaabf927598f1460e23c206b0a 2015-08-13 17:05:25 -05:00			`self.assertPathExists(quarantine_path)`


			`if __name__ == "__main__":`
			`unittest.main()`