diff --git a/ingestion/source-connectors/google-drive.mdx b/ingestion/source-connectors/google-drive.mdx index b0cfd327..b72a84a2 100644 --- a/ingestion/source-connectors/google-drive.mdx +++ b/ingestion/source-connectors/google-drive.mdx @@ -24,8 +24,4 @@ import GoogleDrivePyV2 from '/snippets/source_connectors/google_drive.v2.py.mdx' import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx'; - - -import GoogleCredentialsFileAsString from '/snippets/general-shared-text/google-credentials-file-as-string.mdx'; - - \ No newline at end of file + \ No newline at end of file diff --git a/snippets/general-shared-text/google-credentials-file-as-string.mdx b/snippets/general-shared-text/google-credentials-file-as-string.mdx deleted file mode 100644 index e74f963c..00000000 --- a/snippets/general-shared-text/google-credentials-file-as-string.mdx +++ /dev/null @@ -1,44 +0,0 @@ -## Output a key file's contents as a string - -If you need to convert the contents of a `credentials.json` file into a string, you could use a Python script such as the following. -This script takes the local path to the key file as input and outputs the key file's contents as a string. - -```python Python -# Filename: json_file_to_string.py - -import json -import sys - -def json_file_to_string(file_path): - try: - # Read the JSON file. - with open(file_path, 'r') as file: - # Load the JSON data. - data = json.load(file) - - # Convert the JSON data back to a string, with no whitespace. 
- json_string = json.dumps(data, separators=(',', ':')) - - return json_string - - except FileNotFoundError: - print(f"Error: File '{file_path}' not found.") - return None - except json.JSONDecodeError: - print(f"Error: '{file_path}' is not a valid JSON file.") - return None - except Exception as e: - print(f"An unexpected error occurred: {e}") - return None - -if __name__ == "__main__": - if len(sys.argv) != 2: - print("Usage: python json_file_to_string.py ") - sys.exit(1) - - file_path = sys.argv[1] - result = json_file_to_string(file_path) - - if result: - print(result) -``` \ No newline at end of file diff --git a/snippets/general-shared-text/google-drive-api-placeholders.mdx b/snippets/general-shared-text/google-drive-api-placeholders.mdx index 548b478e..9807c644 100644 --- a/snippets/general-shared-text/google-drive-api-placeholders.mdx +++ b/snippets/general-shared-text/google-drive-api-placeholders.mdx @@ -1,6 +1,6 @@ - `` (_required_) - A unique name for this connector. - `` - The ID for the target Google Drive folder or drive. -- `` - The contents of the `credentials.json` key file as a single-line string. +- `` - The contents of the `credentials.json` key file. - For `extensions`, set one or more `` values (such as `pdf` or `docx`) to process files with only those extensions. The default is to include all extensions. diff --git a/snippets/general-shared-text/google-drive-cli-api.mdx b/snippets/general-shared-text/google-drive-cli-api.mdx index 497e8591..ff561132 100644 --- a/snippets/general-shared-text/google-drive-cli-api.mdx +++ b/snippets/general-shared-text/google-drive-cli-api.mdx @@ -14,7 +14,7 @@ The following environment variables: - One of the following: - `GCP_SERVICE_ACCOUNT_KEY_FILEPATH` - The path to the `credentials.json` key file, represented by `--service-account-key-path` (CLI) or `service_account_key_path` (Python). 
- - `GCP_SERVICE_ACCOUNT_KEY_STRING` - The contents of the `credentials.json` key file as a string, represented by `--service-account-key` (CLI) or `service_account_key` (Python). + - `GCP_SERVICE_ACCOUNT_KEY_STRING` - The contents of the `credentials.json` key file, represented by `--service-account-key` (CLI) or `service_account_key` (Python). To use `--extensions` with a comma-separated list (CLI) or `extensions` with an array of strings (Python) to process files with only those extensions, diff --git a/snippets/general-shared-text/google-drive.mdx b/snippets/general-shared-text/google-drive.mdx index 00e47ae7..acfa9fa1 100644 --- a/snippets/general-shared-text/google-drive.mdx +++ b/snippets/general-shared-text/google-drive.mdx @@ -15,23 +15,52 @@ allowfullscreen [Create a service account](https://developers.google.com/workspace/guides/create-credentials#create_a_service_account). [Create credentials for a service account](https://developers.google.com/workspace/guides/create-credentials#create_credentials_for_a_service_account). - To ensure maximum compatibility across Unstructured service offerings, you should give the service account key information to Unstructured as - a single-line string that contains the contents of the downloaded service account key file (and not the service account key file itself). - To print this single-line string without line breaks, suitable for copying, you can run one of the following commands from your Terminal or Command Prompt. - In this command, replace `` with the path to the `credentials.json` key file that you downloaded by following the preceding instructions. + To ensure maximum compatibility across Unstructured service offerings, you should give Unstructured + the contents of the downloaded service account key file (and not the service account key file itself). 
The approach you use + to give Unstructured this information depends on how you intend to call Unstructured, as well as the operating system you're using, as follows. - - For macOS or Linux: + - For the [Unstructured user interface (UI)](/ui/overview): - ```text + Print the contents of the downloaded service account key file as a single-line string without line breaks, suitable for copying, by running one of the following commands from your Terminal or PowerShell. + In this command, replace `` with the path to the `credentials.json` key file that you downloaded by following the preceding instructions. + + For macOS or Linux: + + ```bash Bash tr -d '\n' < ``` - - For Windows: + For Windows: - ```text + ```powershell PowerShell (Get-Content -Path "" -Raw).Replace("`r`n", "").Replace("`n", "") ``` + Copy the output of this command into the **Account Key** field in the Unstructured UI. + + - For the [Unstructured API](/api-reference/overview) and [Unstructured Ingest](/ingestion/overview): + + Save the contents of the downloaded service account key file as a Base64-encoded string, by running one of the following commands from your Terminal or PowerShell. + In this command, replace `` with the path to the `credentials.json` key file that you downloaded by following the preceding instructions. + + For macOS or Linux: + + ```bash Bash + base64 -i < + ``` + + For Windows: + + ```powershell PowerShell + [Convert]::ToBase64String([IO.File]::ReadAllBytes("")) + ``` + + Set the value of the `GCP_SERVICE_ACCOUNT_KEY_STRING` environment variable to the output of this command. + + Then, in your code or script, before making your Unstructured API or Unstructured Ingest request, + decode the value of this environment variable and add it to the request. In some cases, + you must also escape all double quotes in the decoded string. For more information, see the following code examples. 
+ - A Google Drive [shared folder](https://support.google.com/drive/answer/2375091) or [shared drive](https://support.google.com/a/users/answer/7212025). - Give the service account access to the shared folder or shared drive. To do this, share the folder or drive with the service account's email address. [Learn how](https://support.google.com/drive/answer/7166529). diff --git a/snippets/source_connectors/google_drive.sh.mdx b/snippets/source_connectors/google_drive.sh.mdx index 6d1da5a4..36e2a1ea 100644 --- a/snippets/source_connectors/google_drive.sh.mdx +++ b/snippets/source_connectors/google_drive.sh.mdx @@ -3,19 +3,24 @@ # Chunking and embedding are optional. +service_account_key=$(python3 -c "import base64, json, os; \ +decoded = base64.b64decode(os.environ['GCP_SERVICE_ACCOUNT_KEY_STRING']).decode('utf-8'); \ +parsed = json.loads(decoded); \ +print(json.dumps(parsed))") # Emit real single-line JSON rather than a Python dict repr. + unstructured-ingest \ google-drive \ - --download-dir $LOCAL_FILE_DOWNLOAD_DIR \ - --drive-id $GOOGLE_DRIVE_FOLDER_ID \ + --download-dir "$LOCAL_FILE_DOWNLOAD_DIR" \ --service-account-key-path $GCP_SERVICE_ACCOUNT_KEY_FILEPATH \ # Or - --service-account-key $GCP_SERVICE_ACCOUNT_KEY_STRING \ + --service-account-key "$service_account_key" \ + --drive-id "$GOOGLE_DRIVE_FOLDER_ID" \ --partition-by-api \ - --api-key $UNSTRUCTURED_API_KEY \ - --partition-endpoint $UNSTRUCTURED_API_URL \ + --api-key "$UNSTRUCTURED_API_KEY" \ + --partition-endpoint "$UNSTRUCTURED_API_URL" \ --strategy hi_res \ --chunking-strategy by_title \ --embedding-provider huggingface \ --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ local \ - --output-dir $LOCAL_FILE_OUTPUT_DIR + --output-dir "$LOCAL_FILE_OUTPUT_DIR" ``` diff --git a/snippets/source_connectors/google_drive.v2.py.mdx b/snippets/source_connectors/google_drive.v2.py.mdx index b198053a..acfc42fd 100644 --- a/snippets/source_connectors/google_drive.v2.py.mdx +++ 
b/snippets/source_connectors/google_drive.v2.py.mdx @@ -14,6 +14,8 @@ from unstructured_ingest.processes.chunker import ChunkerConfig from unstructured_ingest.processes.embedder import EmbedderConfig from unstructured_ingest.processes.connectors.local import LocalUploaderConfig +import base64 + # Chunking and embedding are optional. if __name__ == "__main__": @@ -24,7 +26,7 @@ if __name__ == "__main__": source_connection_config=GoogleDriveConnectionConfig( access_config=GoogleDriveAccessConfig( service_account_key_path=os.getenv("GCP_SERVICE_ACCOUNT_KEY_FILEPATH"), # Or - service_account_key=os.getenv("GCP_SERVICE_ACCOUNT_KEY_STRING") + service_account_key=base64.b64decode(s=os.getenv("GCP_SERVICE_ACCOUNT_KEY_STRING")).decode(encoding="utf-8") ), drive_id=os.getenv("GOOGLE_DRIVE_FOLDER_ID"), ), diff --git a/snippets/source_connectors/google_drive_sdk.mdx b/snippets/source_connectors/google_drive_sdk.mdx index 136391d1..68c11bce 100644 --- a/snippets/source_connectors/google_drive_sdk.mdx +++ b/snippets/source_connectors/google_drive_sdk.mdx @@ -9,6 +9,8 @@ from unstructured_client.models.shared import ( GoogleDriveSourceConnectorConfigInput ) +import base64 + with UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY")) as client: response = client.sources.create_source( request=CreateSourceRequest( @@ -17,7 +19,7 @@ with UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY")) as clien type=SourceConnectorType.GOOGLE_DRIVE, config=GoogleDriveSourceConnectorConfigInput( drive_id="", - service_account_key="", + service_account_key=base64.b64decode(s=os.getenv("GCP_SERVICE_ACCOUNT_KEY_STRING")).decode(encoding="utf-8").replace('"', '\\"'), extensions=[ "", ""