Commit: Merged with master

Allan Douglas R. de Oliveira committed Mar 10, 2017
2 parents efceb7d + 3d53326 commit 39b8372
Showing 27 changed files with 401 additions and 114 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -8,6 +8,7 @@ install:
script:
- "py.test ./tests/test_static.py"
- "py.test ./tests/test_flintrock.py"
- "py.test ./tests/test_core.py"
- "pip install -r requirements/maintainer.pip"
- "py.test ./tests/test_pyinstaller_packaging.py"
addons:
36 changes: 35 additions & 1 deletion CHANGES.md
@@ -2,10 +2,44 @@

## [Unreleased]

-[Unreleased]: https://github.com/nchammas/flintrock/compare/v0.7.0...master
+[Unreleased]: https://github.com/nchammas/flintrock/compare/v0.8.0...master

Nothing notable yet.

+## [0.8.0] - 2017-02-11
+
+[0.8.0]: https://github.com/nchammas/flintrock/compare/v0.7.0...v0.8.0
+
+### Added
+
+* [#180]: Accessing data on S3 from your Flintrock cluster is now much
+  easier! Just configure Flintrock to use Hadoop 2.7+ (which is the
+  default) and an appropriate IAM role, and you'll be able to access
+  paths on S3 using the new `s3a://` prefix. [Check the README] for
+  more information.
+* [#176], [#187]: Flintrock now supports users with non-standard home
+  directories.
+
+[#180]: https://github.com/nchammas/flintrock/pull/180
+[#176]: https://github.com/nchammas/flintrock/pull/176
+[#187]: https://github.com/nchammas/flintrock/pull/187
+[Check the README]: https://github.com/nchammas/flintrock/tree/v0.8.0#accessing-data-on-s3
+
+### Changed
+
+* [#168]: Flintrock now does a better job of cleaning up after
+  interrupted operations.
+* [#179], [#184]: Flintrock can now clean up malformed Flintrock
+  clusters.
+* [`6b426ae`]: We fixed an issue affecting some users of Flintrock's
+  standalone package that caused Flintrock to intermittently throw
+  `ImportError`s.
+
+[#168]: https://github.com/nchammas/flintrock/pull/168
+[#179]: https://github.com/nchammas/flintrock/pull/179
+[#184]: https://github.com/nchammas/flintrock/pull/184
+[`6b426ae`]: https://github.com/nchammas/flintrock/commit/6b426aedc7e92b434021cc09c6e7eb181fca7eef
+
## [0.7.0] - 2016-11-15

[0.7.0]: https://github.com/nchammas/flintrock/compare/v0.6.0...v0.7.0
34 changes: 29 additions & 5 deletions README.md
@@ -18,10 +18,10 @@ Here's a quick way to launch a cluster on EC2, assuming you already have an [AWS
```sh
flintrock launch test-cluster \
    --num-slaves 1 \
-    --spark-version 2.0.2 \
+    --spark-version 2.1.0 \
    --ec2-key-name key_name \
    --ec2-identity-file /path/to/key.pem \
-    --ec2-ami ami-b73b63a0 \
+    --ec2-ami ami-0b33d91d \
    --ec2-user ec2-user
```

@@ -57,6 +57,30 @@ flintrock <subcommand> --help

That's not all. Flintrock has a few more [features](#features) that you may find interesting.

+### Accessing data on S3
+
+We recommend you access data on S3 from your Flintrock cluster by following
+these steps:
+
+1. Set up an [IAM Role](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html)
+   that grants access to S3 as desired. Reference this role when you launch
+   your cluster using the `--ec2-instance-profile-name` option (or its
+   equivalent in your `config.yaml` file).
+2. Reference S3 paths in your Spark code using the `s3a://` prefix. `s3a://` is
+   backwards compatible with `s3n://` and replaces both `s3n://` and `s3://`.
+   The Hadoop project [recommends using `s3a://`](https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html#S3A)
+   since it is actively developed, supports larger files, and offers
+   better performance.
+3. Make sure Flintrock is configured to use Hadoop/HDFS 2.7+. Earlier
+   versions of Hadoop do not have solid implementations of `s3a://`.
+   Flintrock's default is Hadoop 2.7.3, so you don't need to do anything
+   here if you're using a vanilla configuration.
+
+With this approach you don't need to copy around your AWS credentials
+or pass them into your Spark programs. As long as the assigned IAM role
+allows it, Spark will be able to read and write data to S3 simply by
+referencing the appropriate path (e.g. `s3a://bucket/path/to/file`).
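For illustration, a launch command that attaches such a role might look like
the following sketch (`s3-access-role` is a placeholder; create the IAM role
and instance profile first):

```sh
# "s3-access-role" is a hypothetical instance profile name.
flintrock launch s3-test-cluster \
    --num-slaves 1 \
    --ec2-instance-profile-name s3-access-role \
    --ec2-key-name key_name \
    --ec2-identity-file /path/to/key.pem \
    --ec2-ami ami-0b33d91d \
    --ec2-user ec2-user
```

Once the cluster is up, Spark jobs can read and write S3 paths without
handling credentials directly. A minimal PySpark sketch (the bucket and paths
are hypothetical):

```python
# The instance's IAM role supplies credentials; s3a:// paths then work directly.
# "my-bucket" and the paths below are made-up placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("s3a-demo").getOrCreate()

# Read a text file from S3 via the s3a:// scheme.
lines = spark.read.text("s3a://my-bucket/input/data.txt")

# Write results back to S3; permissions come from the cluster's IAM role.
lines.write.mode("overwrite").parquet("s3a://my-bucket/output/")
```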


## Installation

@@ -101,7 +101,7 @@ unzip it to a location of your choice, and run the `flintrock` executable inside
For example:

```sh
flintrock_version="0.7.0"
flintrock_version="0.8.0"

curl --location --remote-name "https://github.com/nchammas/flintrock/releases/download/v$flintrock_version/Flintrock-$flintrock_version-standalone-OSX-x86_64.zip"
unzip -q -d flintrock "Flintrock-$flintrock_version-standalone-OSX-x86_64.zip"
@@ -196,7 +220,7 @@ provider: ec2

services:
  spark:
-    version: 2.0.2
+    version: 2.1.0

launch:
  num-slaves: 1
@@ -207,7 +231,7 @@ providers:
    identity-file: /path/to/.ssh/key.pem
    instance-type: m3.medium
    region: us-east-1
-    ami: ami-b73b63a0
+    ami: ami-0b33d91d
    user: ec2-user
```
2 changes: 1 addition & 1 deletion flintrock/__init__.py
@@ -1,2 +1,2 @@
# See: https://packaging.python.org/en/latest/distributing/#standards-compliance-for-interoperability
-__version__ = '0.8.0.dev0'
+__version__ = '0.9.0.dev0'
8 changes: 6 additions & 2 deletions flintrock/config.yaml.template
@@ -1,6 +1,6 @@
services:
  spark:
-    version: 2.0.2
+    version: 2.1.0
    # git-commit: latest # if not 'latest', provide a full commit SHA; e.g. d6dc12ef0146ae409834c78737c116050961f350
    # git-repository: # optional; defaults to https://github.com/apache/spark
    # optional; defaults to download from the official Spark S3 bucket
@@ -24,7 +24,7 @@ providers:
    instance-type: m3.medium
    region: us-east-1
    # availability-zone: <name>
-    ami: ami-b73b63a0 # Amazon Linux, us-east-1
+    ami: ami-0b33d91d # Amazon Linux, us-east-1
    user: ec2-user
    # ami: ami-61bbf104 # CentOS 7, us-east-1
    # user: centos
@@ -36,6 +36,10 @@ providers:
    # - group-name1
    # - group-name2
    # instance-profile-name:
+    # tags:
+    #   - key1,value1
+    #   - key2, value2 # leading/trailing spaces are trimmed
+    #   - key3, # value will be empty
    tenancy: default # default | dedicated
    ebs-optimized: no # yes | no
    instance-initiated-shutdown-behavior: terminate # terminate | stop
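The new `tags` entries above are plain `key,value` strings. Here is a sketch
of how such entries could be parsed, based only on the behavior the comments
describe (trimmed whitespace, empty values allowed); this is an assumption,
not necessarily Flintrock's actual parsing code:

```python
# Hypothetical parser for "key,value" tag entries, inferred from the
# template comments above; not Flintrock's actual implementation.
def parse_tag(entry: str) -> tuple:
    key, _, value = entry.partition(',')
    return key.strip(), value.strip()

assert parse_tag('key1,value1') == ('key1', 'value1')
assert parse_tag('key2, value2') == ('key2', 'value2')  # spaces are trimmed
assert parse_tag('key3,') == ('key3', '')               # empty value
```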
101 changes: 68 additions & 33 deletions flintrock/core.py
@@ -144,8 +144,8 @@ def load_manifest(self, *, user: str, identity_file: str):
        manifest_raw = ssh_check_output(
            client=master_ssh_client,
            command="""
-                cat /home/{u}/.flintrock-manifest.json
-                """.format(u=shlex.quote(user)))
+                cat "$HOME/.flintrock-manifest.json"
+                """)
        # TODO: Would it be better if storage (ephemeral and otherwise) was
        # implemented as a Flintrock service and tracked in the manifest?
        ephemeral_dirs_raw = ssh_check_output(
@@ -427,30 +427,65 @@ def login(
        user=user,
        identity_file=identity_file)

-    def generate_template_mapping(self, *, service: str) -> dict:
-        """
-        Generate a template mapping from a FlintrockCluster instance that we can use
-        to fill in template parameters.
-        """
-        root_dir = posixpath.join(self.storage_dirs.root, service)
-        ephemeral_dirs = ','.join(posixpath.join(path, service) for path in self.storage_dirs.ephemeral)
-
-        template_mapping = {
-            'master_ip': self.master_ip,
-            'master_host': self.master_host,
-            'slave_ips': '\n'.join(self.slave_ips),
-            'slave_hosts': '\n'.join(self.slave_hosts),
-            'root_dir': root_dir,
-            'ephemeral_dirs': ephemeral_dirs,
-
-            # If ephemeral storage is available, it replaces the root volume, which is
-            # typically persistent. We don't want to mix persistent and ephemeral
-            # storage since that causes problems after cluster stop/start; some volumes
-            # have leftover data, whereas others start fresh.
-            'root_ephemeral_dirs': ephemeral_dirs if ephemeral_dirs else root_dir,
-        }
-
-        return template_mapping
+def generate_template_mapping(
+        *,
+        cluster: FlintrockCluster,
+        # If we add additional services later on we may want to refactor
+        # this to take a list of services and dynamically pull the service
+        # name.
+        hadoop_version: str,
+        spark_version: str
+) -> dict:
+    """
+    Generate a template mapping from a FlintrockCluster instance that we can use
+    to fill in template parameters.
+    """
+    hadoop_root_dir = posixpath.join(cluster.storage_dirs.root, 'hadoop')
+    hadoop_ephemeral_dirs = ','.join(
+        posixpath.join(path, 'hadoop')
+        for path in cluster.storage_dirs.ephemeral
+    )
+    spark_root_dir = posixpath.join(cluster.storage_dirs.root, 'spark')
+    spark_ephemeral_dirs = ','.join(
+        posixpath.join(path, 'spark')
+        for path in cluster.storage_dirs.ephemeral
+    )
+
+    template_mapping = {
+        'master_ip': cluster.master_ip,
+        'master_host': cluster.master_host,
+        'slave_ips': '\n'.join(cluster.slave_ips),
+        'slave_hosts': '\n'.join(cluster.slave_hosts),
+
+        'hadoop_version': hadoop_version,
+        'hadoop_short_version': '.'.join(hadoop_version.split('.')[:2]),
+        'spark_version': spark_version,
+        'spark_short_version': '.'.join(spark_version.split('.')[:2]),
+
+        'hadoop_root_dir': hadoop_root_dir,
+        'hadoop_ephemeral_dirs': hadoop_ephemeral_dirs,
+        'spark_root_dir': spark_root_dir,
+        'spark_ephemeral_dirs': spark_ephemeral_dirs,
+
+        # If ephemeral storage is available, it replaces the root volume, which is
+        # typically persistent. We don't want to mix persistent and ephemeral
+        # storage since that causes problems after cluster stop/start; some volumes
+        # have leftover data, whereas others start fresh.
+        'hadoop_root_ephemeral_dirs': hadoop_ephemeral_dirs if hadoop_ephemeral_dirs else hadoop_root_dir,
+        'spark_root_ephemeral_dirs': spark_ephemeral_dirs if spark_ephemeral_dirs else spark_root_dir,
+    }
+
+    return template_mapping
+
+
+# TODO: Cache these files. (?) They are being read potentially tens or
+# hundreds of times. Maybe it doesn't matter because the files
+# are so small.
+def get_formatted_template(*, path: str, mapping: dict) -> str:
+    with open(path) as f:
+        formatted = f.read().format(**mapping)
+    return formatted
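To see how the new `generate_template_mapping` and `get_formatted_template`
fit together, here is a small sketch that uses an inline template string
instead of a real Flintrock template file (the host name and versions are
made up):

```python
# Hypothetical values; a real mapping comes from generate_template_mapping().
mapping = {
    'master_host': 'ec2-203-0-113-10.compute-1.amazonaws.com',
    'spark_version': '2.1.0',
    'spark_short_version': '2.1',
}

# Template files use str.format-style {placeholders}; get_formatted_template()
# fills them the same way after reading the template from disk.
template = "export SPARK_MASTER_HOST={master_host}  # Spark {spark_version}"
print(template.format(**mapping))
# export SPARK_MASTER_HOST=ec2-203-0-113-10.compute-1.amazonaws.com  # Spark 2.1.0
```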


def run_against_hosts(*, partial_func: functools.partial, hosts: list):
@@ -539,10 +574,10 @@ def setup_node(
command="""
set -e
echo {private_key} > ~/.ssh/id_rsa
echo {public_key} >> ~/.ssh/authorized_keys
echo {private_key} > "$HOME/.ssh/id_rsa"
echo {public_key} >> "$HOME/.ssh/authorized_keys"
chmod 400 ~/.ssh/id_rsa
chmod 400 "$HOME/.ssh/id_rsa"
""".format(
private_key=shlex.quote(cluster.ssh_key_pair.private),
public_key=shlex.quote(cluster.ssh_key_pair.public)))
@@ -613,11 +648,11 @@ def provision_cluster(
    ssh_check_output(
        client=master_ssh_client,
        command="""
-            echo {m} > /home/{u}/.flintrock-manifest.json
-            chmod go-rw /home/{u}/.flintrock-manifest.json
+            echo {m} > "$HOME/.flintrock-manifest.json"
+            chmod go-rw "$HOME/.flintrock-manifest.json"
        """.format(
-            m=shlex.quote(json.dumps(manifest, indent=4, sort_keys=True)),
-            u=shlex.quote(user)))
+            m=shlex.quote(json.dumps(manifest, indent=4, sort_keys=True))
+        ))

    for service in services:
        service.configure_master(
@@ -830,4 +865,4 @@ def copy_file_node(
# core.py and services.py. I've thought about how to remove this circular dependency,
# but for now this seems like what we need to go with.
# Flintrock modules
-from .services import HDFS, Spark # Used by start_cluster()
+from .services import HDFS, Spark # Used by start_cluster() # noqa