diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index f4ac4d577e3..810e43fe3b5 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -17,6 +17,8 @@ make_deploy_resources_variables(): Bug fix for specify the image_id as the ocid of the image in the task.yaml file, in this case the image_id for the node config should be set to the ocid instead of a dict. + - Hysun He (hysun.he@oracle.com) @ Oct 13, 2024: + Support more OS types additional to ubuntu for OCI resources. """ import json import logging @@ -295,10 +297,21 @@ def make_deploy_resources_variables( cpus=None if cpus is None else float(cpus), disk_tier=resources.disk_tier) + image_str = self._get_image_str(image_id=resources.image_id, + instance_type=resources.instance_type, + region=region.name) + + # pylint: disable=import-outside-toplevel + from sky.clouds.service_catalog import oci_catalog + os_type = oci_catalog.get_image_os_from_tag(tag=image_str, + region=region.name) + logger.debug(f'OS type for the image {image_str} is {os_type}') + return { 'instance_type': instance_type, 'custom_resources': custom_resources, 'region': region.name, + 'os_type': os_type, 'cpus': str(cpus), 'memory': resources.memory, 'disk_size': resources.disk_size, @@ -501,59 +514,45 @@ def _get_image_id( region_name: str, instance_type: str, ) -> str: - if image_id is None: - return self._get_default_image(region_name=region_name, - instance_type=instance_type) - if None in image_id: - image_id_str = image_id[None] - else: - assert region_name in image_id, image_id - image_id_str = image_id[region_name] + image_id_str = self._get_image_str(image_id=image_id, + instance_type=instance_type, + region=region_name) + if image_id_str.startswith('skypilot:'): image_id_str = service_catalog.get_image_id_from_tag(image_id_str, region_name, clouds='oci') - if image_id_str is None: - logger.critical( - '! Real image_id not found! - {region_name}:{image_id}') - # Raise ResourcesUnavailableError to make sure the failover - # in CloudVMRayBackend will be correctly triggered. - # TODO(zhwu): This is a information leakage to the cloud - # implementor, we need to find a better way to handle this. - raise exceptions.ResourcesUnavailableError( - '! ERR: No image found in catalog for region ' - f'{region_name}. Try setting a valid image_id.') + + # Image_id should be impossible be None, except for the case when + # user specify an image tag which does not exist in the image.csv + # catalog file which only possible in "test" / "evaluation" phase. + # Therefore, we use assert here. + assert image_id_str is not None logger.debug(f'Got real image_id {image_id_str}') return image_id_str - def _get_default_image(self, region_name: str, instance_type: str) -> str: + def _get_image_str(self, image_id: Optional[Dict[Optional[str], str]], + instance_type: str, region: str): + if image_id is None: + image_str = self._get_default_image_tag(instance_type) + elif None in image_id: + image_str = image_id[None] + else: + assert region in image_id, image_id + image_str = image_id[region] + return image_str + + def _get_default_image_tag(self, instance_type: str) -> str: acc = self.get_accelerators_from_instance_type(instance_type) if acc is None: image_tag = oci_utils.oci_config.get_default_image_tag() - image_id_str = service_catalog.get_image_id_from_tag(image_tag, - region_name, - clouds='oci') else: assert len(acc) == 1, acc image_tag = oci_utils.oci_config.get_default_gpu_image_tag() - image_id_str = service_catalog.get_image_id_from_tag(image_tag, - region_name, - clouds='oci') - if image_id_str is not None: - logger.debug( - f'Got default image_id {image_id_str} from tag {image_tag}') - return image_id_str - - # Raise ResourcesUnavailableError to make sure the failover in - # CloudVMRayBackend will be correctly triggered. - # TODO(zhwu): This is a information leakage to the cloud implementor, - # we need to find a better way to handle this. - raise exceptions.ResourcesUnavailableError( - 'ERR: No image found in catalog for region ' - f'{region_name}. Try update your default image_id settings.') + return image_tag def get_vpu_from_disktier( self, cpus: Optional[float], diff --git a/sky/clouds/service_catalog/oci_catalog.py b/sky/clouds/service_catalog/oci_catalog.py index a18dee79be5..47d0489f6ab 100644 --- a/sky/clouds/service_catalog/oci_catalog.py +++ b/sky/clouds/service_catalog/oci_catalog.py @@ -7,6 +7,8 @@ - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation - Hysun He (hysun.he@oracle.com) @ Jun, 2023: Reduce retry times by excluding those unsubscribed regions. + - Hysun He (hysun.he@oracle.com) @ Oct 14, 2024: Bug fix for validation + of the Marketplace images """ import logging @@ -206,4 +208,24 @@ def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]: def is_image_tag_valid(tag: str, region: Optional[str]) -> bool: """Returns whether the image tag is valid.""" + # Oct.14, 2024 by Hysun He: Marketplace images are region neutral, so don't + # check with region for the Marketplace images. + df = _image_df[_image_df['Tag'].str.fullmatch(tag)] + if df.empty: + return False + app_catalog_listing_id = df['AppCatalogListingId'].iloc[0] + if app_catalog_listing_id: + return True return common.is_image_tag_valid_impl(_image_df, tag, region) + + +def get_image_os_from_tag(tag: str, region: Optional[str]) -> Optional[str]: + del region + df = _image_df[_image_df['Tag'].str.fullmatch(tag)] + if df.empty: + os_type = oci_utils.oci_config.get_default_image_os() + else: + os_type = df['OS'].iloc[0] + + logger.debug(f'Operation system for the image {tag} is {os_type}') + return os_type diff --git a/sky/clouds/utils/oci_utils.py b/sky/clouds/utils/oci_utils.py index 3d11bab24da..86647071f3e 100644 --- a/sky/clouds/utils/oci_utils.py +++ b/sky/clouds/utils/oci_utils.py @@ -1,7 +1,9 @@ """OCI Configuration. History: - - Zhanghao Wu @ Oct 2023: Formatting and refactoring - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation + - Zhanghao Wu @ Oct 2023: Formatting and refactoring + - Hysun He (hysun.he@oracle.com) @ Oct, 2024: Add default image OS + configuration. """ import logging import os @@ -121,5 +123,13 @@ def get_profile(cls) -> str: return skypilot_config.get_nested( ('oci', 'default', 'oci_config_profile'), 'DEFAULT') + @classmethod + def get_default_image_os(cls) -> str: + # Get the default image OS. Instead of hardcoding, we give a choice to + # set the default image OS type in the sky's user-config file. (if not + # specified, use the hardcode one at last) + return skypilot_config.get_nested(('oci', 'default', 'image_os_type'), + 'ubuntu') + oci_config = OCIConfig() diff --git a/sky/templates/oci-ray.yml.j2 b/sky/templates/oci-ray.yml.j2 index 32bd6326ee2..64fa4e745c7 100644 --- a/sky/templates/oci-ray.yml.j2 +++ b/sky/templates/oci-ray.yml.j2 @@ -16,7 +16,11 @@ provider: disable_launch_config_check: true auth: +{% if os_type == "ubuntu" %} ssh_user: ubuntu +{% else %} + ssh_user: opc +{% endif %} ssh_private_key: {{ssh_private_key}} available_node_types: @@ -85,14 +89,20 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - echo "setup commands runs at $(date)" > /tmp/provision.tmp.out || true; + {%- if os_type == "ubuntu" %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; sudo pkill -9 apt-get; sudo pkill -9 dpkg; sudo dpkg --configure -a; - ([ `sudo lshw -class display | grep "NVIDIA Corporation" | wc -l` -gt 0 ]) && (sudo which nvidia-smi > /dev/null || ( sudo apt-get install nvidia-driver-530-open -y && sudo apt-get install nvidia-driver-525-server -y ) || true); + {%- else %} + sudo /usr/libexec/oci-growfs -y || true; + sudo systemctl stop firewalld || true; + sudo systemctl disable firewalld || true; + {%- endif %} mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} {{ ray_skypilot_installation_commands }}