ARQ bind and unbind support for vGPU
This patch is part of the vGPU support feature in Cyborg. It implements ARQ bind and unbind for vGPU resources. Co-Authored-By: Wenping Song <songwenping@inspur.com> Change-Id: I32c3b81345c6ce83834a83c64b88e37926724f16
This commit is contained in:
parent
79e1928554
commit
4b34d897d2
@ -19,6 +19,7 @@ from oslo_log import log as logging
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
import cyborg.common.exception as exception
|
||||||
import cyborg.conf
|
import cyborg.conf
|
||||||
import cyborg.privsep
|
import cyborg.privsep
|
||||||
|
|
||||||
@ -41,6 +42,27 @@ def lspci_privileged():
|
|||||||
return processutils.execute(*cmd)
|
return processutils.execute(*cmd)
|
||||||
|
|
||||||
|
|
||||||
|
@cyborg.privsep.sys_admin_pctxt.entrypoint
def create_mdev_privileged(pci_addr, mdev_type, ah_uuid):
    """Create a mediated device (mdev) by writing to its sysfs node.

    The UUID is written to the ``create`` file of the requested mdev type
    under the parent device's ``mdev_supported_types`` directory.

    :param pci_addr: name of the parent device entry under
        ``/sys/class/mdev_bus``.
    :param mdev_type: name of the mdev type directory to instantiate.
    :param ah_uuid: UUID written to the ``create`` file.
    :returns: ``ah_uuid``, unchanged.
    :raises AttachHandleUUIDNeeded: if ``ah_uuid`` is None.
    """
    if ah_uuid is None:
        raise exception.AttachHandleUUIDNeeded()
    create_node = (
        '/sys/class/mdev_bus/{0}/mdev_supported_types/{1}/create'.format(
            pci_addr, mdev_type))
    with open(create_node, 'w') as sysfs_file:
        sysfs_file.write(ah_uuid)
    return ah_uuid
|
||||||
|
|
||||||
|
|
||||||
|
@cyborg.privsep.sys_admin_pctxt.entrypoint
def remove_mdev_privileged(physical_device, mdev_type, medv_uuid):
    """Remove a mediated device (mdev) by writing to its sysfs node.

    Writes ``"1"`` to the ``remove`` file of the mdev found under the
    parent device's ``mdev_supported_types/<type>/devices`` directory.

    :param physical_device: name of the parent device entry under
        ``/sys/class/mdev_bus``.
    :param mdev_type: name of the mdev type directory the device belongs to.
    :param medv_uuid: UUID of the mdev to remove.
    """
    remove_node = (
        '/sys/class/mdev_bus/{0}/mdev_supported_types/'
        '{1}/devices/{2}/remove'
    ).format(physical_device, mdev_type, medv_uuid)
    with open(remove_node, 'w') as sysfs_file:
        sysfs_file.write("1")
|
||||||
|
|
||||||
|
|
||||||
def get_pci_devices(pci_flags, vendor_id=None):
|
def get_pci_devices(pci_flags, vendor_id=None):
|
||||||
device_for_vendor_out = []
|
device_for_vendor_out = []
|
||||||
all_device_out = []
|
all_device_out = []
|
||||||
|
@ -21,6 +21,7 @@ from oslo_service import periodic_task
|
|||||||
from oslo_utils import uuidutils
|
from oslo_utils import uuidutils
|
||||||
|
|
||||||
from cyborg.accelerator.drivers.fpga.base import FPGADriver
|
from cyborg.accelerator.drivers.fpga.base import FPGADriver
|
||||||
|
from cyborg.accelerator.drivers.gpu import utils as gpu_utils
|
||||||
from cyborg.agent.resource_tracker import ResourceTracker
|
from cyborg.agent.resource_tracker import ResourceTracker
|
||||||
from cyborg.agent.rpcapi import AgentAPI
|
from cyborg.agent.rpcapi import AgentAPI
|
||||||
from cyborg.common import exception
|
from cyborg.common import exception
|
||||||
@ -80,3 +81,11 @@ class AgentManager(periodic_task.PeriodicTasks):
|
|||||||
def update_available_resource(self, context, startup=True):
|
def update_available_resource(self, context, startup=True):
|
||||||
"""Update all kinds of accelerator resources from their drivers."""
|
"""Update all kinds of accelerator resources from their drivers."""
|
||||||
self._rt.update_usage(context)
|
self._rt.update_usage(context)
|
||||||
|
|
||||||
|
def create_vgpu_mdev(self, context, pci_addr, asked_type, ah_uuid):
    """Create a vGPU mediated device on this host.

    :param context: request context (not used by this method).
    :param pci_addr: address of the parent device, passed through to
        ``create_mdev_privileged``.
    :param asked_type: mdev type to instantiate.
    :param ah_uuid: attach handle UUID used as the mdev UUID.
    :returns: the value returned by ``create_mdev_privileged`` (the
        UUID that was written).
    """
    LOG.debug('Instantiate a mediated device')
    # Propagate the helper's result so a synchronous RPC ``call`` on the
    # client side receives the created UUID instead of None.
    return gpu_utils.create_mdev_privileged(pci_addr, asked_type, ah_uuid)
|
||||||
|
|
||||||
|
def remove_vgpu_mdev(self, context, pci_addr, asked_type, ah_uuid):
    """Remove a vGPU mediated device from this host.

    :param context: request context (not used by this method).
    :param pci_addr: address of the parent device, passed through to
        ``remove_mdev_privileged``.
    :param asked_type: mdev type the device was created with.
    :param ah_uuid: UUID of the mdev to remove.
    """
    LOG.debug('Remove a vgpu mdev')
    gpu_utils.remove_mdev_privileged(
        pci_addr, asked_type, ah_uuid)
|
||||||
|
@ -61,3 +61,25 @@ class AgentAPI(object):
|
|||||||
controlpath_id=controlpath_id,
|
controlpath_id=controlpath_id,
|
||||||
bitstream_uuid=bitstream_uuid,
|
bitstream_uuid=bitstream_uuid,
|
||||||
driver_name=driver_name)
|
driver_name=driver_name)
|
||||||
|
|
||||||
|
def create_vgpu_mdev(self, context, hostname, pci_addr,
                     asked_type, ah_uuid):
    """Ask the agent on ``hostname`` to create a vGPU mediated device.

    :param context: request context forwarded with the RPC.
    :param hostname: server the RPC is directed to.
    :param pci_addr: address of the parent device.
    :param asked_type: mdev type to instantiate.
    :param ah_uuid: attach handle UUID used as the mdev UUID.
    :returns: the result of the remote ``create_vgpu_mdev`` call.
    """
    # The two literal fragments are concatenated by the parser, so each
    # field needs its own explicit separator.
    LOG.debug('Agent create_vgpu_mdev: hostname: (%s), pci_address: (%s), '
              'gpu_id: (%s)', hostname, pci_addr, ah_uuid)
    version = '1.0'
    cctxt = self.client.prepare(server=hostname, version=version)
    return cctxt.call(context, 'create_vgpu_mdev',
                      pci_addr=pci_addr,
                      asked_type=asked_type,
                      ah_uuid=ah_uuid)
|
||||||
|
|
||||||
|
def remove_vgpu_mdev(self, context, hostname, pci_addr,
                     asked_type, ah_uuid):
    """Ask the agent on ``hostname`` to remove a vGPU mediated device.

    :param context: request context forwarded with the RPC.
    :param hostname: server the RPC is directed to.
    :param pci_addr: address of the parent device.
    :param asked_type: mdev type the device was created with.
    :param ah_uuid: UUID of the mdev to remove.
    :returns: the result of the remote ``remove_vgpu_mdev`` call.
    """
    LOG.debug('Agent remove_vgpu_mdev: hostname: (%s) gpu_id: (%s)',
              hostname, ah_uuid)
    target = self.client.prepare(server=hostname, version='1.0')
    rpc_kwargs = {
        'pci_addr': pci_addr,
        'asked_type': asked_type,
        'ah_uuid': ah_uuid,
    }
    return target.call(context, 'remove_vgpu_mdev', **rpc_kwargs)
|
||||||
|
@ -60,6 +60,7 @@ class ARQ(base.APIBase):
|
|||||||
"""The UUID of the instance project_id associated with this ARQ, if any"""
|
"""The UUID of the instance project_id associated with this ARQ, if any"""
|
||||||
|
|
||||||
attach_handle_type = wtypes.text
|
attach_handle_type = wtypes.text
|
||||||
|
attach_handle_uuid = wtypes.text
|
||||||
attach_handle_info = {wtypes.text: wtypes.text}
|
attach_handle_info = {wtypes.text: wtypes.text}
|
||||||
|
|
||||||
links = wsme.wsattr([link.Link], readonly=True)
|
links = wsme.wsattr([link.Link], readonly=True)
|
||||||
|
@ -92,6 +92,10 @@ class AttachHandleAlreadyExists(CyborgException):
|
|||||||
_msg_fmt = _("AttachHandle with uuid %(uuid)s already exists.")
|
_msg_fmt = _("AttachHandle with uuid %(uuid)s already exists.")
|
||||||
|
|
||||||
|
|
||||||
|
class AttachHandleUUIDNeeded(CyborgException):
    """Error signalling that no AttachHandle UUID was provided."""

    _msg_fmt = _("Need to provide AttachHandle uuid.")
|
||||||
|
|
||||||
|
|
||||||
class ControlpathIDAlreadyExists(CyborgException):
|
class ControlpathIDAlreadyExists(CyborgException):
|
||||||
_msg_fmt = _("ControlpathID with uuid %(uuid)s already exists.")
|
_msg_fmt = _("ControlpathID with uuid %(uuid)s already exists.")
|
||||||
|
|
||||||
|
@ -370,7 +370,7 @@ class ConductorManager(object):
|
|||||||
"resource_providers?name=" + hostname).json()
|
"resource_providers?name=" + hostname).json()
|
||||||
pr_uuid = provider["resource_providers"][0]["uuid"]
|
pr_uuid = provider["resource_providers"][0]["uuid"]
|
||||||
return pr_uuid
|
return pr_uuid
|
||||||
except IndexError:
|
except (IndexError, KeyError):
|
||||||
raise exception.PlacementResourceProviderNotFound(
|
raise exception.PlacementResourceProviderNotFound(
|
||||||
resource_provider=hostname)
|
resource_provider=hostname)
|
||||||
|
|
||||||
|
@ -48,6 +48,7 @@ class ARQ(base.CyborgObject, object_base.VersionedObjectDictCompat):
|
|||||||
|
|
||||||
# Fields populated by Cyborg after binding
|
# Fields populated by Cyborg after binding
|
||||||
'attach_handle_type': object_fields.StringField(nullable=True),
|
'attach_handle_type': object_fields.StringField(nullable=True),
|
||||||
|
'attach_handle_uuid': object_fields.StringField(nullable=True),
|
||||||
'attach_handle_info': object_fields.DictOfStringsField(nullable=True),
|
'attach_handle_info': object_fields.DictOfStringsField(nullable=True),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13,11 +13,14 @@
|
|||||||
# License for the specific language governing permissions and limitations
|
# License for the specific language governing permissions and limitations
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
from openstack import connection
|
from openstack import connection
|
||||||
from oslo_log import log as logging
|
from oslo_log import log as logging
|
||||||
from oslo_utils import versionutils
|
from oslo_utils import versionutils
|
||||||
from oslo_versionedobjects import base as object_base
|
from oslo_versionedobjects import base as object_base
|
||||||
|
|
||||||
|
from cyborg.agent.rpcapi import AgentAPI
|
||||||
from cyborg.common import constants
|
from cyborg.common import constants
|
||||||
from cyborg.common.constants import ARQ_STATES_TRANSFORM_MATRIX
|
from cyborg.common.constants import ARQ_STATES_TRANSFORM_MATRIX
|
||||||
from cyborg.common import exception
|
from cyborg.common import exception
|
||||||
@ -78,6 +81,10 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
|
|||||||
if target_version < (1, 2) and 'deployable_id' in primitive:
|
if target_version < (1, 2) and 'deployable_id' in primitive:
|
||||||
del primitive['deployable_id']
|
del primitive['deployable_id']
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
    """Initialize the object and give it an agent RPC client.

    All positional and keyword arguments are forwarded unchanged to the
    parent initializer.
    """
    super(ExtARQ, self).__init__(*args, **kwargs)
    # RPC client used to ask the agent to create/remove vGPU mdevs.
    self.agent = AgentAPI()
|
||||||
|
|
||||||
def create(self, context, device_profile_id=None):
|
def create(self, context, device_profile_id=None):
|
||||||
"""Create an ExtARQ record in the DB."""
|
"""Create an ExtARQ record in the DB."""
|
||||||
if 'device_profile_name' not in self.arq and not device_profile_id:
|
if 'device_profile_name' not in self.arq and not device_profile_id:
|
||||||
@ -213,6 +220,16 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
|
|||||||
try:
|
try:
|
||||||
ah = AttachHandle.allocate(context, deployable.id)
|
ah = AttachHandle.allocate(context, deployable.id)
|
||||||
self.attach_handle_id = ah.id
|
self.attach_handle_id = ah.id
|
||||||
|
# if attach_handle is a vgpu, create the mdev in the sys path
|
||||||
|
if ah.attach_type == 'MDEV':
|
||||||
|
attach_info = json.loads(ah.attach_info)
|
||||||
|
pci_addr = "{}:{}:{}.{}".format(
|
||||||
|
attach_info['domain'], attach_info['bus'],
|
||||||
|
attach_info['device'], attach_info['function'])
|
||||||
|
hostname = self.arq.hostname
|
||||||
|
asked_type = attach_info['asked_type']
|
||||||
|
self.agent.create_vgpu_mdev(
|
||||||
|
context, hostname, pci_addr, asked_type, ah.uuid)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
LOG.error("Failed to allocate attach handle for ARQ %s"
|
LOG.error("Failed to allocate attach handle for ARQ %s"
|
||||||
"from deployable %s. Reason: %s",
|
"from deployable %s. Reason: %s",
|
||||||
@ -237,9 +254,17 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
|
|||||||
# if (self.arq.state == constants.ARQ_DELETING
|
# if (self.arq.state == constants.ARQ_DELETING
|
||||||
# or self.arq.state == ARQ_UNBOUND):
|
# or self.arq.state == ARQ_UNBOUND):
|
||||||
|
|
||||||
def _deallocate_attach_handle(self, context, ah_id):
|
def _deallocate_attach_handle(self, context, ah_id, hostname):
|
||||||
try:
|
try:
|
||||||
attach_handle = AttachHandle.get_by_id(context, ah_id)
|
attach_handle = AttachHandle.get_by_id(context, ah_id)
|
||||||
|
if attach_handle.attach_type == 'MDEV':
|
||||||
|
attach_info = json.loads(attach_handle.attach_info)
|
||||||
|
pci_addr = "{}:{}:{}.{}".format(
|
||||||
|
attach_info['domain'], attach_info['bus'],
|
||||||
|
attach_info['device'], attach_info['function'])
|
||||||
|
self.agent.remove_vgpu_mdev(
|
||||||
|
context, hostname, pci_addr,
|
||||||
|
attach_info['asked_type'], attach_handle.uuid)
|
||||||
attach_handle.deallocate(context)
|
attach_handle.deallocate(context)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
LOG.error("Failed to deallocate attach handle %s for ARQ %s."
|
LOG.error("Failed to deallocate attach handle %s for ARQ %s."
|
||||||
@ -252,6 +277,7 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
|
|||||||
|
|
||||||
def unbind(self, context):
|
def unbind(self, context):
|
||||||
arq = self.arq
|
arq = self.arq
|
||||||
|
hostname = arq.hostname
|
||||||
arq.hostname = None
|
arq.hostname = None
|
||||||
arq.device_rp_uuid = None
|
arq.device_rp_uuid = None
|
||||||
arq.instance_uuid = None
|
arq.instance_uuid = None
|
||||||
@ -260,7 +286,7 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
|
|||||||
# Unbind: mark attach handles as freed
|
# Unbind: mark attach handles as freed
|
||||||
ah_id = self.attach_handle_id
|
ah_id = self.attach_handle_id
|
||||||
if ah_id:
|
if ah_id:
|
||||||
self._deallocate_attach_handle(context, ah_id)
|
self._deallocate_attach_handle(context, ah_id, hostname)
|
||||||
self.attach_handle_id = None
|
self.attach_handle_id = None
|
||||||
self.deployable_id = None
|
self.deployable_id = None
|
||||||
self.save(context)
|
self.save(context)
|
||||||
@ -285,6 +311,7 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat,
|
|||||||
if db_ah is not None:
|
if db_ah is not None:
|
||||||
db_extarq['attach_handle_type'] = db_ah['attach_type']
|
db_extarq['attach_handle_type'] = db_ah['attach_type']
|
||||||
db_extarq['attach_handle_info'] = db_ah['attach_info']
|
db_extarq['attach_handle_info'] = db_ah['attach_info']
|
||||||
|
db_extarq['attach_handle_uuid'] = db_ah['uuid']
|
||||||
else:
|
else:
|
||||||
raise exception.ResourceNotFound(
|
raise exception.ResourceNotFound(
|
||||||
resource='Attach Handle',
|
resource='Attach Handle',
|
||||||
|
@ -352,7 +352,8 @@ class TestExtARQObject(base.DbTestCase):
|
|||||||
self, mock_deallocate, mock_ah, mock_check_state):
|
self, mock_deallocate, mock_ah, mock_check_state):
|
||||||
obj_extarq = self.fake_obj_extarqs[0]
|
obj_extarq = self.fake_obj_extarqs[0]
|
||||||
mock_ah.return_value = self.fake_obj_ahs[0]
|
mock_ah.return_value = self.fake_obj_ahs[0]
|
||||||
obj_extarq._deallocate_attach_handle(self.context, mock_ah.id)
|
obj_extarq._deallocate_attach_handle(
|
||||||
|
self.context, mock_ah.id, obj_extarq.arq.hostname)
|
||||||
mock_check_state.assert_not_called()
|
mock_check_state.assert_not_called()
|
||||||
|
|
||||||
@mock.patch('logging.LoggerAdapter.error')
|
@mock.patch('logging.LoggerAdapter.error')
|
||||||
@ -370,7 +371,8 @@ class TestExtARQObject(base.DbTestCase):
|
|||||||
mock_deallocate.side_effect = e
|
mock_deallocate.side_effect = e
|
||||||
self.assertRaises(
|
self.assertRaises(
|
||||||
exception.ResourceNotFound,
|
exception.ResourceNotFound,
|
||||||
obj_extarq._deallocate_attach_handle, self.context, mock_ah.id)
|
obj_extarq._deallocate_attach_handle, self.context, mock_ah.id,
|
||||||
|
obj_extarq.arq.hostname)
|
||||||
mock_log.assert_called_once_with(
|
mock_log.assert_called_once_with(
|
||||||
msg, mock_ah.id, obj_extarq.arq.uuid, str(e))
|
msg, mock_ah.id, obj_extarq.arq.uuid, str(e))
|
||||||
mock_check_state.assert_called_once_with(
|
mock_check_state.assert_called_once_with(
|
||||||
|
Loading…
Reference in New Issue
Block a user