From 22d6fe98a3f437709901fac4e4ec65fec414f7d0 Mon Sep 17 00:00:00 2001
From: Goutham Pacha Ravi
Date: Fri, 7 Aug 2020 12:38:05 -0700
Subject: [PATCH] Fix capacity calculations in the CephFS driver

The driver inflated total and available capacity due to an incorrect
calculation. The driver was also ignoring the configuration option
"reserved_share_percentage", which allows deployers to set aside space
from scheduling to prevent oversubscription.

While this bugfix may have an upgrade impact, some things must be
clarified:

- Inflating the total and free space allowed manila to schedule
  workloads that may run out of space. This can cause end user downtime
  and frustration, because shares are created easily (empty subvolumes
  on ceph occupy no space), but they could get throttled as they start
  to fill up.
- CephFS shares are always thinly provisioned, but the driver does not
  support oversubscription via manila, so real free space is what
  determines capacity-based scheduler decisions. Users, however, expect
  share sizes to be honored, and manila will allow provisioning as long
  as there is free space on the cluster. This means that Ceph cluster
  administrators must manage oversubscription outside of manila to
  prevent misbehavior.

Depends-On: Ic96b65d2caab788afca8bfc45575f3c05dc88008
Change-Id: I6ab157d6d099fe910ec1d90193783b55053ce8f6
Closes-Bug: #1890833
Signed-off-by: Goutham Pacha Ravi
---
 doc/source/admin/cephfs_driver.rst            | 16 ++++++++++++++
 manila/share/drivers/cephfs/driver.py         |  7 +++---
 .../tests/share/drivers/cephfs/test_driver.py | 13 ++++++++---
 ...rect-capacity-report-3a9bdaffcc62ec71.yaml | 22 +++++++++++++++++++
 4 files changed, 52 insertions(+), 6 deletions(-)
 create mode 100644 releasenotes/notes/bug-1890833-fix-cephfs-incorrect-capacity-report-3a9bdaffcc62ec71.yaml

diff --git a/doc/source/admin/cephfs_driver.rst b/doc/source/admin/cephfs_driver.rst
index 07650be21e..e08ac6eb75 100644
--- a/doc/source/admin/cephfs_driver.rst
+++ b/doc/source/admin/cephfs_driver.rst
@@ -314,6 +314,22 @@ using the section name, ``cephfsnfs1``.
 
     enabled_share_backends = generic1, cephfsnfs1
 
+Space considerations
+~~~~~~~~~~~~~~~~~~~~
+
+The CephFS driver reports total and free capacity available across the Ceph
+cluster to manila to allow provisioning. All CephFS shares are thinly
+provisioned, i.e., empty shares do not consume any significant space
+on the cluster. The CephFS driver does not allow controlling oversubscription
+via manila. So, as long as there is free space, provisioning will continue,
+and eventually this may cause your Ceph cluster to be over provisioned and
+you may run out of space if shares are being filled to capacity. It is
+advised that you use Ceph's monitoring tools to monitor space usage and add
+more storage when required in order to honor space requirements for
+provisioned manila shares. You may use the driver configuration option
+``reserved_share_percentage`` to prevent manila from filling up your Ceph
+cluster, and allow existing shares to grow.
+
 Creating shares
 ~~~~~~~~~~~~~~~
 
diff --git a/manila/share/drivers/cephfs/driver.py b/manila/share/drivers/cephfs/driver.py
index 2b821bd41c..42cfa38c47 100644
--- a/manila/share/drivers/cephfs/driver.py
+++ b/manila/share/drivers/cephfs/driver.py
@@ -167,8 +167,8 @@ class CephFSDriver(driver.ExecuteMixin, driver.GaneshaMixin,
     def _update_share_stats(self):
         stats = self.volume_client.rados.get_cluster_stats()
 
-        total_capacity_gb = stats['kb'] * units.Mi
-        free_capacity_gb = stats['kb_avail'] * units.Mi
+        total_capacity_gb = round(stats['kb'] / units.Mi, 2)
+        free_capacity_gb = round(stats['kb_avail'] / units.Mi, 2)
 
         data = {
             'vendor_name': 'Ceph',
@@ -182,7 +182,8 @@ class CephFSDriver(driver.ExecuteMixin, driver.GaneshaMixin,
             'total_capacity_gb': total_capacity_gb,
             'free_capacity_gb': free_capacity_gb,
             'qos': 'False',
-            'reserved_percentage': 0,
+            'reserved_percentage': self.configuration.safe_get(
+                'reserved_share_percentage'),
             'dedupe': [False],
             'compression': [False],
             'thin_provisioning': [False]
diff --git a/manila/tests/share/drivers/cephfs/test_driver.py b/manila/tests/share/drivers/cephfs/test_driver.py
index 854cf9686b..ab52a73191 100644
--- a/manila/tests/share/drivers/cephfs/test_driver.py
+++ b/manila/tests/share/drivers/cephfs/test_driver.py
@@ -75,8 +75,10 @@ class MockVolumeClientModule(object):
             self.get_used_bytes = mock.Mock(return_value=self.mock_used_bytes)
             self.rados = mock.Mock()
             self.rados.get_cluster_stats = mock.Mock(return_value={
-                "kb": 1000,
-                "kb_avail": 500
+                "kb": 172953600,
+                "kb_avail": 157123584,
+                "kb_used": 15830016,
+                "num_objects": 26,
             })
 
 
@@ -352,10 +354,15 @@ class CephFSDriverTestCase(test.TestCase):
     def test_update_share_stats(self):
         self._driver.get_configured_ip_versions = mock.Mock(return_value=[4])
-        self._driver._volume_client
+        self._driver.configuration.local_conf.set_override(
+            'reserved_share_percentage', 5)
+
         self._driver._update_share_stats()
 
         result = self._driver._stats
 
+        self.assertEqual(5, result['pools'][0]['reserved_percentage'])
+        self.assertEqual(164.94, result['pools'][0]['total_capacity_gb'])
+        self.assertEqual(149.84, result['pools'][0]['free_capacity_gb'])
         self.assertTrue(result['ipv4_support'])
         self.assertFalse(result['ipv6_support'])
         self.assertEqual("CEPHFS", result['storage_protocol'])
diff --git a/releasenotes/notes/bug-1890833-fix-cephfs-incorrect-capacity-report-3a9bdaffcc62ec71.yaml b/releasenotes/notes/bug-1890833-fix-cephfs-incorrect-capacity-report-3a9bdaffcc62ec71.yaml
new file mode 100644
index 0000000000..7417cc930f
--- /dev/null
+++ b/releasenotes/notes/bug-1890833-fix-cephfs-incorrect-capacity-report-3a9bdaffcc62ec71.yaml
@@ -0,0 +1,22 @@
+---
+upgrade:
+  - |
+    This version includes a fix to the CephFS drivers to address `an issue
+    <https://launchpad.net/bugs/1890833>`_ with total and free space
+    calculation in the CephFS driver. When you update, you will notice that
+    the space calculations reflect reality in your Ceph clusters, and
+    provisioning may fail if the share sizes exceed the cluster's free
+    space. CephFS shares are always thin provisioned, and the driver does
+    not support oversubscription via Manila; so space can be claimed for new
+    shares as long as there is free space on the cluster. Use the
+    "reserved_share_percentage" back end configuration option to ensure
+    there's always space left aside for provisioned workloads to grow over time.
+fixes:
+  - |
+    The CephFS driver has now been fixed to report total and available space
+    on the storage system correctly. See `Launchpad bug #1890833
+    <https://launchpad.net/bugs/1890833>`_ for more details.
+  - |
+    The CephFS driver now honors the configuration option
+    "reserved_share_percentage", which can be used to set aside space for
+    provisioned workloads to grow over time.
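
A quick standalone sanity check of the corrected conversion follows. It is
an illustrative sketch only, not part of the patch: KIB_PER_GIB stands in
for oslo_utils' units.Mi, and the sample figures are the cluster stats used
in the updated test_update_share_stats above.

    # Ceph's get_cluster_stats() reports sizes in KiB. Dividing by
    # 1024 * 1024 (units.Mi) converts KiB to GiB; the old code
    # multiplied instead, inflating the result by a factor of 2**40.
    KIB_PER_GIB = 1024 * 1024  # same value as oslo_utils.units.Mi

    stats = {"kb": 172953600, "kb_avail": 157123584}  # values from the test

    old_total = stats["kb"] * KIB_PER_GIB                         # ~1.8e14 "GB"
    total_capacity_gb = round(stats["kb"] / KIB_PER_GIB, 2)       # 164.94
    free_capacity_gb = round(stats["kb_avail"] / KIB_PER_GIB, 2)  # 149.84

    assert total_capacity_gb == 164.94
    assert free_capacity_gb == 149.84

The rounded GiB values match those asserted in the updated unit test, so the
test data doubles as a worked example of the fixed calculation.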