[WIP] Support nomad #750

Open · wants to merge 14 commits into base: develop
2 changes: 2 additions & 0 deletions clipper_admin/clipper_admin/__init__.py
@@ -2,6 +2,8 @@

from .docker.docker_container_manager import DockerContainerManager
from .kubernetes.kubernetes_container_manager import KubernetesContainerManager
from .nomad.nomad_container_manager import NomadContainerManager
from .nomad.consul_dns import ConsulDNS
from .clipper_admin import *
from . import deployers
from .version import __version__, __registry__
26 changes: 26 additions & 0 deletions clipper_admin/clipper_admin/nomad/consul_dns.py
@@ -0,0 +1,26 @@
from .dns import DNS
import dns.resolver
import socket

"""
Consul is a service networking solution to connect and secure services across any runtime platform and public or private cloud
"""
class ConsulDNS(DNS):
Collaborator commented:

Couldn't find the usage in this PR. Where is it used?

@asauray (Author) commented on Oct 17, 2019:
It should be used on the client side as follows:

from clipper_admin.deployers import python as python_deployer
from clipper_admin import ClipperConnection, DockerContainerManager, NomadContainerManager, ConsulDNS

nomad_ip_addr = '10.65.30.43'
dns = ConsulDNS() # We use Consul for DNS resolution
container_manager = NomadContainerManager(
    nomad_ip=nomad_ip_addr,
    dns=dns
)
clipper_conn = ClipperConnection(container_manager)
clipper_conn.connect()

I will document this as well


"""
This method resolves records of IP and Ports with a SRV DNS request
Parameters:
domain str:
The domain to resolve, in Consul this correspond to the healthcheck name
"""
def resolveSRV(self, check_name):
addr = '{}.service.consul'.format(check_name)
srv_records= dns.resolver.query(addr, 'SRV')
srvInfo = {}
for srv in srv_records:
srvInfo['host'] = str(srv.target).rstrip('.')
srvInfo['port'] = srv.port
host = srvInfo['host']
port = srvInfo['port']
return (socket.gethostbyname(host), port)
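
For reference, a minimal sketch of how resolveSRV could be called on its own; the check name clipper-mgmt-default is purely illustrative and not defined by this PR:

from clipper_admin import ConsulDNS

dns = ConsulDNS()
# Resolve the SRV record that Consul registered for a (hypothetical) check name
ip, port = dns.resolveSRV('clipper-mgmt-default')
print('service reachable at {}:{}'.format(ip, port))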

14 changes: 14 additions & 0 deletions clipper_admin/clipper_admin/nomad/dns.py
@@ -0,0 +1,14 @@
import abc
from abc import abstractmethod


class DNS(abc.ABC):

    """
    This method resolves an (IP, port) pair with a DNS SRV request.
    Parameters:
    domain str:
        The domain to resolve
    """
    @abstractmethod
    def resolveSRV(self, domain):
        pass
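
As an illustration of how this interface is meant to be extended, a hypothetical stub (not part of this PR) that could back unit tests:

class StaticDNS(DNS):
    """Toy resolver that returns a fixed (ip, port) pair."""

    def __init__(self, ip, port):
        self.ip = ip
        self.port = port

    def resolveSRV(self, domain):
        # Ignore the domain and return the configured address
        return (self.ip, self.port)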

22 changes: 22 additions & 0 deletions clipper_admin/clipper_admin/nomad/fabio_load_balancer.py
@@ -0,0 +1,22 @@
import abc
from abc import abstractmethod
from .load_balancer import LoadBalancer

"""
Fabio is an HTTP and TCP router that load balances traffic to services registered in Consul.
"""
class FabioLoadBalancer(LoadBalancer):

    """
    Parameters
    ----------

    address: str
        The address at which the load balancer is located. For instance fabio.service.consul
    port: str
        The port on which the TCP proxy listens; this is not the HTTP port on which Fabio proxies HTTP requests!
        https://fabiolb.net/feature/tcp-proxy/
    """
    def __init__(self, address, port):
        self.address = address
        self.port = port
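
Note that FabioLoadBalancer does not override the abstract tcp and http methods declared on LoadBalancer (see load_balancer.py below), so instantiating it raises a TypeError. A hedged sketch of overrides that could complete the class follows; the return formats and Fabio's default HTTP listener port 9999 are assumptions, not taken from this PR:

    def tcp(self, address):
        # Assumption: TCP traffic is proxied straight through Fabio's dedicated TCP listener
        return '{}:{}'.format(self.address, self.port)

    def http(self, address):
        # Assumption: HTTP requests go through Fabio's default HTTP listener on port 9999
        return 'http://{}:9999/{}'.format(self.address, address)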
12 changes: 12 additions & 0 deletions clipper_admin/clipper_admin/nomad/load_balancer.py
@@ -0,0 +1,12 @@
import abc
from abc import abstractmethod


class LoadBalancer(abc.ABC):

    @abstractmethod
    def tcp(self, address):
        pass

    @abstractmethod
    def http(self, address):
        pass

74 changes: 74 additions & 0 deletions clipper_admin/clipper_admin/nomad/mgmt_deployment.py
@@ -0,0 +1,74 @@
from .utils import nomad_job_prefix, mgmt_job_prefix, mgmt_check
import os


""" Nomad job payload to deploy a new mgmt (management frontend) instance """
def mgmt_deployment(
        job_id,
        datacenters,
        cluster_name,
        image,
        redis_ip,
        redis_port,
        num_replicas,
        cpu=500,
        memory=256,
        health_check_interval=3000000000,
        health_check_timeout=2000000000):
    job = {
        'Job': {
            'ID': job_id,
            'Datacenters': datacenters,
            'Type': 'service',
            'TaskGroups': [
                {
                    'Name': nomad_job_prefix(cluster_name),
                    'Count': num_replicas,
                    'Tasks': [
                        {
                            'Name': mgmt_job_prefix(cluster_name),
                            'Driver': 'docker',
                            'Config': {
                                'args': [
                                    # If redis_ip / redis_port are None, fall back to the env vars
                                    "--redis_ip={}".format(redis_ip or os.environ.get('REDIS_SERVICE_IP')),
                                    "--redis_port={}".format(redis_port or os.environ.get('REDIS_SERVICE_PORT'))
                                ],
                                'image': image,
                                'port_map': [
                                    {'http': 1338}
                                ]
                            },
                            'Resources': {
                                'CPU': cpu,
                                'MemoryMB': memory,
                                'Networks': [
                                    {
                                        'DynamicPorts': [{'Label': 'http', 'Value': 1338}]
                                    }
                                ]
                            },
                            'Services': [
                                {
                                    'Name': mgmt_check(cluster_name),
                                    'Tags': ['machine-learning', 'model', 'clipper', 'mgmt'],
                                    'PortLabel': 'http',
                                    'Checks': [
                                        {
                                            'Name': 'alive',
                                            'Type': 'tcp',
                                            # Nomad durations are in nanoseconds (3s interval, 2s timeout by default)
                                            'Interval': health_check_interval,
                                            'Timeout': health_check_timeout
                                        }
                                    ]
                                }
                            ]
                        }
                    ]
                }
            ]
        }
    }
    return job
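
For context, one way this payload could be registered with Nomad's HTTP API (the Nomad address, Redis address, and image tag below are illustrative; the container manager in this PR is expected to do the equivalent internally):

import requests
from clipper_admin.nomad.mgmt_deployment import mgmt_deployment

job = mgmt_deployment(
    job_id='clipper-mgmt',
    datacenters=['dc1'],
    cluster_name='default',
    image='clipper/management_frontend:develop',
    redis_ip='10.0.0.5',
    redis_port=6379,
    num_replicas=1)

# Nomad's job registration endpoint listens on port 4646 by default
resp = requests.post('http://10.65.30.43:4646/v1/jobs', json=job)
resp.raise_for_status()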
80 changes: 80 additions & 0 deletions clipper_admin/clipper_admin/nomad/model_deployment.py
@@ -0,0 +1,80 @@
from .utils import nomad_job_prefix, model_job_prefix, generate_model_job_name, model_check_name

""" Nomad job payload to deploy a new model container """
def model_deployment(
        job_id,
        datacenters,
        cluster_name,
        model_name,
        model_version,
        input_type,
        image,
        num_replicas,
        query_frontend_ip,
        query_frontend_port,
        cpu=500,
        memory=256,
        health_check_interval=3000000000,
        health_check_timeout=2000000000):
    job = {
        'Job': {
            'ID': job_id,
            'Datacenters': datacenters,
            'Type': 'service',
            'TaskGroups': [
                {
                    'Name': 'clipper-{}'.format(cluster_name),
                    'Count': num_replicas,
                    'Tasks': [
                        {
                            'Name': generate_model_job_name(cluster_name, model_name, model_version),
                            'Driver': 'docker',
                            'Env': {
                                'CLIPPER_MODEL_NAME': model_name,
                                'CLIPPER_MODEL_VERSION': model_version,
                                'CLIPPER_IP': query_frontend_ip,
                                'CLIPPER_PORT': query_frontend_port,
                                'CLIPPER_INPUT_TYPE': input_type
                            },
                            'Config': {
                                'image': image,
                                'port_map': [
                                    {'zeromq': 1390}
                                ],
                                # Use the Nomad client's own IP for DNS (e.g. a node-local Consul agent)
                                'dns_servers': ["${attr.unique.network.ip-address}"]
                            },
                            'Resources': {
                                'CPU': cpu,
                                'MemoryMB': memory,
                                'Networks': [
                                    {
                                        'DynamicPorts': [
                                            {'Label': 'zeromq', 'Value': 1390}
                                        ]
                                    }
                                ]
                            },
                            'Services': [
                                {
                                    'Name': model_check_name(cluster_name, model_name, model_version),
                                    'Tags': ['machine-learning', 'model', 'clipper', model_name],
                                    'PortLabel': 'zeromq',
                                    'Checks': [
                                        {
                                            'Name': 'alive',
                                            'Type': 'tcp',
                                            # Nomad durations are in nanoseconds (3s interval, 2s timeout by default)
                                            'Interval': health_check_interval,
                                            'Timeout': health_check_timeout
                                        }
                                    ]
                                }
                            ]
                        }
                    ]
                }
            ]
        }
    }
    return job
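
Similarly, a hedged sketch of building a model job, resolving the query frontend's address through ConsulDNS first; the check name query-frontend-default and the model details are illustrative, not taken from this PR:

from clipper_admin import ConsulDNS
from clipper_admin.nomad.model_deployment import model_deployment

dns = ConsulDNS()
# Hypothetical Consul check name for the query frontend's RPC service
frontend_ip, frontend_port = dns.resolveSRV('query-frontend-default')

job = model_deployment(
    job_id='clipper-model-sum-model',
    datacenters=['dc1'],
    cluster_name='default',
    model_name='sum-model',
    model_version='1',
    input_type='doubles',
    image='default-cluster-sum-model:1',
    num_replicas=1,
    query_frontend_ip=frontend_ip,
    query_frontend_port=frontend_port)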