diff --git a/docs.json b/docs.json index 8df7a3ff..5ef53557 100644 --- a/docs.json +++ b/docs.json @@ -12,13 +12,8 @@ "tabs": [ { "tab": "Welcome", - "groups": [ - { - "group": "", - "pages": [ - "welcome" - ] - } + "pages": [ + "welcome" ] }, { @@ -462,6 +457,44 @@ } ] }, + { + "tab": "Self-hosted", + "groups": [ + { + "group": "Self-hosting", + "pages": [ + "self-hosted/overview" + ] + }, + { + "group": "Security and compliance", + "pages": [ + "self-hosted/security-compliance/overview" + ] + }, + { + "group": "AWS", + "pages": [ + "self-hosted/aws/overview", + "self-hosted/aws/onboard" + ] + }, + { + "group": "Azure", + "pages": [ + "self-hosted/azure/overview", + "self-hosted/azure/onboard" + ] + }, + { + "group": "GCP", + "pages": [ + "self-hosted/gcp/overview", + "self-hosted/gcp/onboard" + ] + } + ] + }, { "tab": "FAQ", "groups": [ @@ -637,6 +670,10 @@ "source": "/api-reference/ingest/:slug*", "destination": "/ingestion/:slug*" }, + { + "source": "/api-reference/troubleshooting/api-key-url", + "destination": "/api-reference/troubleshooting/api-key-url" + }, { "source": "/glossary/glossary", "destination": "/welcome" diff --git a/self-hosted/aws/onboard.mdx b/self-hosted/aws/onboard.mdx new file mode 100644 index 00000000..4c8a7dc4 --- /dev/null +++ b/self-hosted/aws/onboard.mdx @@ -0,0 +1,328 @@ +--- +title: AWS self-hosted onboarding checklist +sidebarTitle: Onboarding +--- + +<Note> + To proceed with a self-hosted deployment, your organization must first sign a self-hosting agreement with Unstructured. + + If you have not yet signed this agreement, stop here, and begin the self-hosting agreement process by contacting your Unstructured sales representative, emailing + Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or filling out the [contact form](https://unstructured.io/contact) + on the Unstructured website. 
+</Note> + +After your organization has signed the self-hosting agreement with Unstructured, a member of the Unstructured technical enablement team will reach out to you to begin the +deployment onboarding process. To streamline this process, you are encouraged to begin setting up your target environment as soon as possible. To do this, you +must first set up your AWS account as follows. + +## Questions? Need help? + +If you have questions or need help as you go, contact your Unstructured sales representative or technical enablement contact. If you do not know who they are, +email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website, and a member of the Unstructured sales or technical enablement teams +will get back to you as soon as possible. + +## Onboarding checklist + +Set up the following infrastructure within your AWS account for Unstructured to deploy the Unstructured UI and API into. + +### VPC and networking + +- **VPC** + + - CIDR: `10.0.0.0/16` - Any CIDR should work, but make sure it has enough space. 
+ - DNS Hostnames: Enabled + - DNS Support: Enabled + +- **Internet Gateway** + + - Attached to the VPC + +- **Public Subnet** + + - CIDR: `10.0.0.0/24` + - Public IP on launch: true + - Availability Zone: `${region}a` + +- **NAT Gateway + Elastic IP** + + - Lives in the public subnet + +- **Private Subnets (x2)** + + - CIDRs: `10.0.1.0/24`, `10.0.2.0/24` + - AZs: `${region}a` and `${region}b` + +- **Route Tables** + + - Public: default route (`0.0.0.0/0`) via IGW + - Private (x2): default route via NAT Gateway + +### **IAM roles and policies** + +- **EKS Cluster Role** + + - Trusts: `eks.amazonaws.com` + - Attached policies: + + - `AmazonEKSClusterPolicy` + - `AmazonEKSVPCResourceController` + +- **EKS Node Group Role** + + - Trusts: `ec2.amazonaws.com`, `eks.amazonaws.com` + - Attached policies: + + - `AmazonEKSWorkerNodePolicy` + - `AmazonEKS_CNI_Policy` + - `AmazonEC2ContainerRegistryReadOnly` + +- **OIDC Service Account IAM Roles (x3)** + + - Namespaces: `recommender`, `etl-operator`, `data-broker` + - Each role assumes via `sts:AssumeRoleWithWebIdentity` with OIDC provider + - Each has an S3 policy allowing access to specific buckets (see below) + +### **EKS cluster** + +- **EKS Control Plane** + + - Version: `1.31` or greater + - Subnet: Private subnets only + - Public endpoint access: Enabled + - Private endpoint access: Disabled + +- **Node Group** + + - Instance type: `c5.4xlarge` (or larger, depending on cost factors) + - Disk size: 100 GB + - Desired size: 2 (min 2, max 5) + - Remote SSH access: Enabled (with generated SSH key) + - SSH key: Key pair created and exported + +- **Security Groups** + + - EKS Cluster SG (implicitly created by AWS) + - Node SG: Allows all traffic within cluster CIDR (`10.0.0.0/16`), self, and metadata IP + - Egress: Allows all + +#### **Kubernetes add-ons** + +Installed via `aws.eks.Addon`: + +- **EKS Pod Identity Agent** + + - Version: `v1.3.4-eksbuild.1` + +- **Metrics Server** + + - Version: `v0.7.2-eksbuild.1` + +- **EBS 
CSI Driver** + + - Version: `v1.38.1-eksbuild.2` + - Configured with: + + - Service account annotation: `eks.amazonaws.com/role-arn` + - Pod identity access annotation + +#### **Storage class** + +- Name: `ebs-sc` +- Default: Yes +- Provisioner: `ebs.csi.aws.com` +- Parameters: `type=gp3`, `encrypted=true` +- Volume Binding Mode: `WaitForFirstConsumer` + +### **RDS** + +- **RDS Subnet Group** + + - Uses the private subnets + +- **RDS Instance** + + - Engine: Postgres 16 + - Size: `db.t3.micro` + - Allocated storage: 20 GB + - Auth: Set up a username and password, and keep them secure. + - Security group: Allows all traffic from `10.0.0.0/16` (keep in mind your CIDR group from the VPC) + - DB name: `postgres` + +### **S3 buckets** + +- `u10d-{stack_name}-etl-blob-cache` +- `u10d-{stack_name}-etl-job-db` +- `u10d-{stack_name}-etl-job-status` +- `u10d-{stack_name}-job-files` + +All created with: + +- Versioning enabled +- Server-side encryption (AES256) +- Force destroy: true + +### **Keys** + +- **SSH Key Pair** (RSA 4096-bit) + + - Key exported as `private_key` (PEM) + +### Secrets and ConfigMaps + +After your infrastructure is set up, but before Unstructured can deploy the Unstructured UI and API into your infrastructure, +Unstructured will need to know the values of the following Secrets and ConfigMaps. These must be provided to Unstructured as a +set of YAML files in Kubernetes [Secret](https://kubernetes.io/docs/concepts/configuration/secret/) and +[ConfigMap](https://kubernetes.io/docs/concepts/configuration/configmap/) format. + +The Secrets are as follows. 
+ +#### **Blob storage credentials** + +- `BLOB_STORAGE_ADAPTER_ACCESS_KEY_ID` +- `BLOB_STORAGE_ADAPTER_SECRET_ACCESS_KEY` +- `BLOB_STORAGE_ADAPTER_REGION_NAME` + +#### **Database credentials** + +- `DB_USERNAME` +- `DB_PASSWORD` +- `DB_HOST` +- `DB_NAME` +- `DB_DATABASE` (used in `platform-api` only) + +#### **Authentication** + +- `JWT_SECRET_KEY` +- `AUTH_STRATEGY` (sometimes encoded, sometimes not) +- `SESSION_SECRET` +- `SHARED_SECRET` +- `KEYCLOAK_CLIENT_SECRET` +- `KEYCLOAK_ADMIN_SECRET` +- `KEYCLOAK_ADMIN` +- `KEYCLOAK_ADMIN_PASSWORD` +- `API_BEARER_TOKEN` + +The ConfigMaps are as follows. + +#### **Blob storage settings** + +- `BLOB_STORAGE_ADAPTER_TYPE` (always `s3` for AWS) +- `BLOB_STORAGE_ADAPTER_BUCKET` +- `ETL_BLOB_CACHE_BUCKET_NAME` +- `ETL_API_BLOB_STORAGE_ADAPTER_BUCKET` +- `ETL_API_BLOB_STORAGE_ADAPTER_TYPE` +- `ETL_API_DB_REMOTE_BUCKET_NAME` +- `ETL_API_JOB_STATUS_DEST_BUCKET_NAME` +- `JOB_STATUS_BUCKET_NAME` +- `JOB_DB_BUCKET_NAME` + +#### **Environment** + +- `ENV` +- `ENVIRONMENT` +- `JOB_ENV` +- `JOB_ENVIRONMENT` + +#### **Observability and OpenTelemetry (OTel)** + +- `JOB_OTEL_EXPORTER_OTLP_ENDPOINT` +- `JOB_OTEL_METRICS_EXPORTER` +- `JOB_OTEL_TRACES_EXPORTER` +- `OTEL_EXPORTER_OTLP_ENDPOINT` +- `OTEL_METRICS_EXPORTER` +- `OTEL_TRACES_EXPORTER` + +#### **Unstructured API and authentication** + +- `UNSTRUCTURED_API_URL` +- `JWKS_URL` +- `JWT_ISSUER` +- `JWT_AUDIENCE` +- `SINGLE_PLANE_DEPLOYMENT` + +#### **Front end and dashboard** + +- `API_BASE_URL` +- `API_CLIENT_BASE_URL` +- `API_URL` +- `APM_SERVICE_NAME` +- `APM_SERVICE_NAME_CLIENT` +- `AUTH_STRATEGY` +- `FRONTEND_BASE_URL` +- `KEYCLOAK_CALLBACK_URL` +- `KEYCLOAK_CLIENT_ID` +- `KEYCLOAK_DOMAIN` +- `KEYCLOAK_REALM` +- `KEYCLOAK_SSL_ENABLED` +- `KEYCLOAK_TRUST_ISSUER` +- `PUBLIC_BASE_URL` +- `PUBLIC_RELEASE_CHANNEL` + +#### **Sentry and feature flags** + +- `SENTRY_DSN` +- `SENTRY_SAMPLE_RATE` +- `WORKFLOW_NODE_EDITOR_FF_REQUEST_FORM` +- `CUSTOM_WORKFLOW_FF_REQUEST_FORM` + +#### **Redis** 
+ +- `REDIS_DSN` + +#### **Other** + +- `IMAGE_PULL_SECRETS` +- `PRIVATE_KEY_SECRETS_ADAPTER_TYPE` +- `PRIVATE_KEY_SECRETS_ADAPTER_AWS_REGION` +- `SECRETS_ADAPTER_TYPE` +- `SECRETS_ADAPTER_AWS_REGION` + +The preceding Secrets and ConfigMaps must be added to the following files: + +| File name | Type | Resource name | Namespace | Data keys | +| --- | --- | --- | --- | --- | +| `data-broker-env-cm.yaml` | ConfigMap | `data-broker-env` | `api` | `JOB_STATUS_BUCKET_NAME`, `JOB_DB_BUCKET_NAME`, `BLOB_STORAGE_ADAPTER_TYPE` | +| `data-broker-env-secret.yaml` | Secret | `data-broker-env` | `api` | `BLOB_STORAGE_ADAPTER_ACCESS_KEY_ID`, `BLOB_STORAGE_ADAPTER_REGION_NAME`, `BLOB_STORAGE_ADAPTER_SECRET_ACCESS_KEY` | +| `dataplane-api-env-cm.yaml` | Secret | `dataplane-api-env` | `api` | `DB_PASSWORD`, `DB_USERNAME`, `DB_HOST`, `DB_NAME` | +| `etl-operator-env-cm.yaml` | ConfigMap | `etl-operator-env` | `etl-operator` | `BLOB_STORAGE_ADAPTER_BUCKET`, `JOB_STATUS_BUCKET_NAME`, `JOB_DB_BUCKET_NAME`, `BLOB_STORAGE_ADAPTER_TYPE`, `ENV`, `ENVIRONMENT`, `REDIS_DSN`, `ETL_API_BLOB_STORAGE_ADAPTER_BUCKET`, `ETL_API_BLOB_STORAGE_ADAPTER_TYPE`, `ETL_API_DB_REMOTE_BUCKET_NAME`, `ETL_API_JOB_STATUS_DEST_BUCKET_NAME` (x2), `ETL_BLOB_CACHE_BUCKET_NAME`, `IMAGE_PULL_SECRETS`, `JOB_ENV`, `JOB_ENVIRONMENT`, `JOB_OTEL_EXPORTER_OTLP_ENDPOINT`, `JOB_OTEL_METRICS_EXPORTER`, `JOB_OTEL_TRACES_EXPORTER`, `OTEL_EXPORTER_OTLP_ENDPOINT`, `OTEL_METRICS_EXPORTER`, `OTEL_TRACES_EXPORTER`, `UNSTRUCTURED_API_URL` | +| `etl-operator-env-secret.yaml` | Secret | `etl-operator-env` | `etl-operator` | `BLOB_STORAGE_ADAPTER_ACCESS_KEY_ID`, `BLOB_STORAGE_ADAPTER_REGION_NAME`, `BLOB_STORAGE_ADAPTER_SECRET_ACCESS_KEY` | +| `frontend-env-cm.yaml` | ConfigMap | `frontend-env` | `www` | `API_BASE_URL`, `API_CLIENT_BASE_URL`, `API_URL`, `APM_SERVICE_NAME`, `APM_SERVICE_NAME_CLIENT`, `AUTH_STRATEGY`, `ENV`, `FRONTEND_BASE_URL`, `KEYCLOAK_CALLBACK_URL`, `KEYCLOAK_CLIENT_ID`, `KEYCLOAK_DOMAIN`, `KEYCLOAK_REALM`, 
`KEYCLOAK_SSL_ENABLED`, `KEYCLOAK_TRUST_ISSUER`, `PUBLIC_BASE_URL`, `PUBLIC_RELEASE_CHANNEL`, `SENTRY_DSN`, `SENTRY_SAMPLE_RATE`, `WORKFLOW_NODE_EDITOR_FF_REQUEST_FORM`, `CUSTOM_WORKFLOW_FF_REQUEST_FORM` | +| `frontend-env-secret.yaml` | Secret | `frontend-env` | `www` | `API_BEARER_TOKEN`, `KEYCLOAK_ADMIN_SECRET`, `KEYCLOAK_CLIENT_SECRET`, `SESSION_SECRET`, `SHARED_SECRET` | +| `keycloak-secret.yaml` | Secret | `phasetwo-keycloak-env` | `www` | `KEYCLOAK_ADMIN`, `KEYCLOAK_ADMIN_PASSWORD` | +| `platform-api-env-cm.yaml` | ConfigMap | `platform-api-env` | `api` | `JWKS_URL`, `JWT_ISSUER`, `JWT_AUDIENCE`, `SINGLE_PLANE_DEPLOYMENT` | +| `platform-api-env-secret.yaml` | Secret | `platform-api-env` | `api` | `DB_PASSWORD`, `DB_USERNAME`, `DB_HOST`, `DB_NAME`, `DB_DATABASE`, `JWT_SECRET_KEY`, `AUTH_STRATEGY` | +| `recommender-env-cm.yaml` | ConfigMap | `recommender-env` | `recommender` | `BLOB_STORAGE_ADAPTER_TYPE`, `ETL_BLOB_CACHE_BUCKET_NAME` | +| `recommender-env-secret.yaml` | Secret | `recommender-env` | `recommender` | `BLOB_STORAGE_ADAPTER_ACCESS_KEY_ID`, `BLOB_STORAGE_ADAPTER_REGION_NAME`, `BLOB_STORAGE_ADAPTER_SECRET_ACCESS_KEY` | +| `secret-provider-api-env-cm.yaml` | ConfigMap | `secrets-provider-api-env` | `secrets` | `ENV`, `ENVIRONMENT`, `OTEL_EXPORTER_OTLP_ENDPOINT`, `OTEL_METRICS_EXPORTER`, `OTEL_TRACES_EXPORTER`, `PRIVATE_KEY_SECRETS_ADAPTER_AWS_REGION`, `PRIVATE_KEY_SECRETS_ADAPTER_TYPE`, `SECRETS_ADAPTER_AWS_REGION`, `SECRETS_ADAPTER_TYPE` | +| `secret-provider-api-env-secret.yaml` | Secret | `secrets-provider-api-env` | `secrets` | `BLOB_STORAGE_ADAPTER_ACCESS_KEY_ID`, `BLOB_STORAGE_ADAPTER_REGION_NAME`, `BLOB_STORAGE_ADAPTER_SECRET_ACCESS_KEY` | +| `usage-collector-env-secret.yaml` | Secret | `usage-collector-env` | `api` | `DB_PASSWORD`, `DB_USERNAME`, `DB_HOST`, `DB_NAME`, `BLOB_STORAGE_ADAPTER_TYPE` | + +For example, for the `data-broker-env-cm.yaml` [ConfigMap](https://kubernetes.io/docs/concepts/configuration/configmap/) file, the contents 
would look like this: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: data-broker-env + namespace: api +data: + JOB_STATUS_BUCKET_NAME: "<your-value>" + JOB_DB_BUCKET_NAME: "<your-value>" + BLOB_STORAGE_ADAPTER_TYPE: "<your-value>" +``` + +For the `data-broker-env-secret.yaml` [Secret](https://kubernetes.io/docs/concepts/configuration/secret/) file, the contents would look like this: + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: data-broker-env + namespace: api +type: Opaque +stringData: + BLOB_STORAGE_ADAPTER_ACCESS_KEY_ID: "<your-value>" + BLOB_STORAGE_ADAPTER_REGION_NAME: "<your-value>" + BLOB_STORAGE_ADAPTER_SECRET_ACCESS_KEY: "<your-value>" +``` diff --git a/self-hosted/aws/overview.mdx b/self-hosted/aws/overview.mdx new file mode 100644 index 00000000..7243b1de --- /dev/null +++ b/self-hosted/aws/overview.mdx @@ -0,0 +1,22 @@ +--- +title: Self-hosting Unstructured on Amazon Web Services (AWS) +sidebarTitle: Overview +--- + +## Getting started + +To get started with a self-hosted deployment, your organization must first sign a self-hosting agreement with Unstructured. +This agreement outlines the terms and conditions for your organization to use Unstructured in a self-hosted environment. + +To begin the self-hosting agreement process, contact your Unstructured sales representative, email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website. + +After your organization has signed the self-hosting agreement, a member of the Unstructured technical enablement team will reach out to you to begin the +deployment onboarding process. To streamline this process, you are encouraged to begin setting up your target environment as soon as possible. To do this, +see the [onboarding checklist](/self-hosted/aws/onboard). + +## Questions? Need help? 
+ +Contact your Unstructured sales representative, email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website, and a member of the Unstructured sales or support teams +will get back to you as soon as possible. \ No newline at end of file diff --git a/self-hosted/azure/onboard.mdx b/self-hosted/azure/onboard.mdx new file mode 100644 index 00000000..97a6d1bc --- /dev/null +++ b/self-hosted/azure/onboard.mdx @@ -0,0 +1,300 @@ +--- +title: Azure self-hosted onboarding checklist +sidebarTitle: Onboarding +--- + +<Note> + To proceed with a self-hosted deployment, your organization must first sign a self-hosting agreement with Unstructured. + + If you have not yet signed this agreement, stop here, and begin the self-hosting agreement process by contacting your Unstructured sales representative, emailing + Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or filling out the [contact form](https://unstructured.io/contact) + on the Unstructured website. +</Note> + +After your organization has signed the self-hosting agreement with Unstructured, a member of the Unstructured technical enablement team will reach out to you to begin the +deployment onboarding process. To streamline this process, you are encouraged to begin setting up your target environment as soon as possible. To do this, you +must first set up your Azure account as follows. + +## Questions? Need help? + +If you have questions or need help as you go, contact your Unstructured sales representative or technical enablement contact. If you do not know who they are, +email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website, and a member of the Unstructured sales or technical enablement teams +will get back to you as soon as possible. 
+ +## Onboarding checklist + +Set up the following infrastructure within your Azure account for Unstructured to deploy the Unstructured UI and API into. + +### **Azure subscription and resource group** + +- **Subscription** + + - Ensure you have access to a valid Azure subscription + - You will need the `subscription_id` if deploying via CLI or Pulumi + +- **Resource Group** + + - Name: `u10d-{env}-rg` + - Region: e.g., `eastus2` + - All resources (VNet, AKS, PostgreSQL, Storage, etc.) will be created inside this group + +### **VNet and networking** + +- **Virtual Network (VNet)** + + - Address space: `10.0.0.0/16` + - DNS Hostnames: Enabled + - DNS Support: Enabled + +- **Internet Access** + + - Handled via Azure's default gateway and public IPs + +- **Public Subnet** + + - Address: `10.0.0.0/24` + - Assign Public IP: true + - Availability Zone: `${region}a` + +- **NAT Gateway + Public IP** + + - NAT Gateway in the public subnet + - Public IP resource attached + +- **Private Subnets (x2)** + + - Addresses: `10.0.1.0/24`, `10.0.2.0/24` + - AZs: `${region}a` and `${region}b` + +- **Route Tables** + + - Public: route `0.0.0.0/0` via internet + - Private: route `0.0.0.0/0` via NAT Gateway + +### **Managed identities and RBAC** + +- **AKS Cluster Managed Identity** + + - Assign roles: + + - `Contributor` or more scoped role + - `Network Contributor` + +- **Node Pool Managed Identity** + + - Assign roles: + + - `Monitoring Metrics Publisher` + - `AcrPull` (if using ACR) + - `Storage Blob Data Reader` + +- **Workload Identity Bindings (x3)** + + - Namespaces: `recommender`, `etl-operator`, `data-broker` + - Use Azure AD Workload Identity Federation + - Assign `Storage Blob Data Contributor` to required containers + +### **AKS Cluster** + +- **Control Plane** + + - Version: `1.31` or higher + - API authorized IPs: optional + - Private cluster networking recommended + +- **Node Pool** + + - VM Size: `Standard_D16s_v5` + - Disk Size: 100 GB + - Desired Size: 2 (min: 2, max: 
5) + - SSH: Enabled via key pair + - SSH key exported in PEM format + +- **NSGs (Network Security Groups)** + + - Allow intra-cluster traffic (`10.0.0.0/16`) + - Allow all egress + +#### **Kubernetes Add-ons** + +Install via Helm or YAML: + +- **Workload Identity Webhook** +- **Metrics Server** — `v0.7.2` +- **Azure Disk CSI Driver** + + - Provisioner: `disk.csi.azure.com` + +### **Storage class** + +```yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: azure-disk-sc + annotations: + storageclass.kubernetes.io/is-default-class: "true" +provisioner: disk.csi.azure.com +parameters: + skuName: Premium_LRS + kind: Managed +volumeBindingMode: WaitForFirstConsumer +``` + +### Secrets and ConfigMaps + +After your infrastructure is set up, but before Unstructured can deploy the Unstructured UI and API into your infrastructure, +Unstructured will need to know the values of the following Secrets and ConfigMaps. These must be provided to Unstructured as a +set of YAML files in Kubernetes [Secret](https://kubernetes.io/docs/concepts/configuration/secret/) and +[ConfigMap](https://kubernetes.io/docs/concepts/configuration/configmap/) format. + +Capture these during setup: + +- DB host, username, password +- Container names +- SSH private key +- Auth secrets + +The Secrets are as follows. + +#### Blob storage credentials (Azure) + +- `BLOB_STORAGE_ADAPTER_ACCOUNT_NAME` +- `BLOB_STORAGE_ADAPTER_ACCOUNT_KEY` +- `BLOB_STORAGE_ADAPTER_CONTAINER_REGION` (optional) + +#### Database credentials + +- `DB_USERNAME` +- `DB_PASSWORD` +- `DB_HOST` +- `DB_NAME` +- `DB_DATABASE` + +#### Authentication + +- `JWT_SECRET_KEY` +- `AUTH_STRATEGY` +- `SESSION_SECRET` +- `SHARED_SECRET` +- `KEYCLOAK_CLIENT_SECRET` +- `KEYCLOAK_ADMIN_SECRET` +- `KEYCLOAK_ADMIN` +- `KEYCLOAK_ADMIN_PASSWORD` +- `API_BEARER_TOKEN` + +The ConfigMaps are as follows. 
+ +#### Blob storage settings + +- `BLOB_STORAGE_ADAPTER_TYPE`: `azure` +- `BLOB_STORAGE_ADAPTER_BUCKET` +- `ETL_BLOB_CACHE_BUCKET_NAME` +- `ETL_API_BLOB_STORAGE_ADAPTER_BUCKET` +- `ETL_API_BLOB_STORAGE_ADAPTER_TYPE`: `azure` +- `ETL_API_DB_REMOTE_BUCKET_NAME` +- `ETL_API_JOB_STATUS_DEST_BUCKET_NAME` +- `JOB_STATUS_BUCKET_NAME` +- `JOB_DB_BUCKET_NAME` + +#### Environment + +- `ENV`, `ENVIRONMENT` +- `JOB_ENV`, `JOB_ENVIRONMENT` + +#### Observability and OpenTelemetry (OTel) + +- `JOB_OTEL_EXPORTER_OTLP_ENDPOINT` +- `JOB_OTEL_METRICS_EXPORTER` +- `JOB_OTEL_TRACES_EXPORTER` +- `OTEL_EXPORTER_OTLP_ENDPOINT` +- `OTEL_METRICS_EXPORTER` +- `OTEL_TRACES_EXPORTER` + +#### Unstructured API and authentication + +- `UNSTRUCTURED_API_URL` +- `JWKS_URL` +- `JWT_ISSUER` +- `JWT_AUDIENCE` +- `SINGLE_PLANE_DEPLOYMENT` + +#### Front end and dashboard + +- `API_BASE_URL` +- `API_CLIENT_BASE_URL` +- `API_URL` +- `APM_SERVICE_NAME` +- `APM_SERVICE_NAME_CLIENT` +- `AUTH_STRATEGY` +- `FRONTEND_BASE_URL` +- `KEYCLOAK_CALLBACK_URL` +- `KEYCLOAK_CLIENT_ID` +- `KEYCLOAK_DOMAIN` +- `KEYCLOAK_REALM` +- `KEYCLOAK_SSL_ENABLED` +- `KEYCLOAK_TRUST_ISSUER` +- `PUBLIC_BASE_URL` +- `PUBLIC_RELEASE_CHANNEL` + +#### Redis + +- `REDIS_DSN` + +#### Other + +- `IMAGE_PULL_SECRETS` +- `PRIVATE_KEY_SECRETS_ADAPTER_TYPE`: `azure` +- `PRIVATE_KEY_SECRETS_ADAPTER_AZURE_REGION` +- `SECRETS_ADAPTER_TYPE`: `azure` +- `SECRETS_ADAPTER_AZURE_REGION` + +The preceding Secrets and ConfigMaps must be added to the following files: + +| File Name | Type | Resource name | Namespace | Data keys | +| --- | --- | --- | --- | --- | +| `data-broker-env-cm.yaml` | ConfigMap | `data-broker-env` | `api` | `JOB_STATUS_BUCKET_NAME`, `JOB_DB_BUCKET_NAME`, `BLOB_STORAGE_ADAPTER_TYPE` | +| `data-broker-env-secret.yaml` | Secret | `data-broker-env` | `api` | `BLOB_STORAGE_ADAPTER_ACCOUNT_NAME`, `BLOB_STORAGE_ADAPTER_ACCOUNT_KEY`, `BLOB_STORAGE_ADAPTER_CONTAINER_REGION` | +| `dataplane-api-env-cm.yaml` | Secret | `dataplane-api-env` | 
`api` | `DB_PASSWORD`, `DB_USERNAME`, `DB_HOST`, `DB_NAME` | +| `etl-operator-env-cm.yaml` | ConfigMap | `etl-operator-env` | `etl-operator` | `BLOB_STORAGE_ADAPTER_BUCKET`, `JOB_STATUS_BUCKET_NAME`, `JOB_DB_BUCKET_NAME`, `BLOB_STORAGE_ADAPTER_TYPE`, `ENV`, `ENVIRONMENT`, `REDIS_DSN`, `ETL_API_BLOB_STORAGE_ADAPTER_BUCKET`, `ETL_API_BLOB_STORAGE_ADAPTER_TYPE`, `ETL_API_DB_REMOTE_BUCKET_NAME`, `ETL_API_JOB_STATUS_DEST_BUCKET_NAME` (x2), `ETL_BLOB_CACHE_BUCKET_NAME`, `IMAGE_PULL_SECRETS`, `JOB_ENV`, `JOB_ENVIRONMENT`, `JOB_OTEL_EXPORTER_OTLP_ENDPOINT`, `JOB_OTEL_METRICS_EXPORTER`, `JOB_OTEL_TRACES_EXPORTER`, `OTEL_EXPORTER_OTLP_ENDPOINT`, `OTEL_METRICS_EXPORTER`, `OTEL_TRACES_EXPORTER`, `UNSTRUCTURED_API_URL` | +| `etl-operator-env-secret.yaml` | Secret | `etl-operator-env` | `etl-operator` | `BLOB_STORAGE_ADAPTER_ACCOUNT_NAME`, `BLOB_STORAGE_ADAPTER_ACCOUNT_KEY`, `BLOB_STORAGE_ADAPTER_CONTAINER_REGION` | +| `frontend-env-cm.yaml` | ConfigMap | `frontend-env` | `www` | `API_BASE_URL`, `API_CLIENT_BASE_URL`, `API_URL`, `APM_SERVICE_NAME`, `APM_SERVICE_NAME_CLIENT`, `AUTH_STRATEGY`, `ENV`, `FRONTEND_BASE_URL`, `KEYCLOAK_CALLBACK_URL`, `KEYCLOAK_CLIENT_ID`, `KEYCLOAK_DOMAIN`, `KEYCLOAK_REALM`, `KEYCLOAK_SSL_ENABLED`, `KEYCLOAK_TRUST_ISSUER`, `PUBLIC_BASE_URL`, `PUBLIC_RELEASE_CHANNEL`, `SENTRY_DSN`, `SENTRY_SAMPLE_RATE`, `WORKFLOW_NODE_EDITOR_FF_REQUEST_FORM`, `CUSTOM_WORKFLOW_FF_REQUEST_FORM` | +| `frontend-env-secret.yaml` | Secret | `frontend-env` | `www` | `API_BEARER_TOKEN`, `KEYCLOAK_ADMIN_SECRET`, `KEYCLOAK_CLIENT_SECRET`, `SESSION_SECRET`, `SHARED_SECRET` | +| `keycloak-secret.yaml` | Secret | `phasetwo-keycloak-env` | `www` | `KEYCLOAK_ADMIN`, `KEYCLOAK_ADMIN_PASSWORD` | +| `platform-api-env-cm.yaml` | ConfigMap | `platform-api-env` | `api` | `JWKS_URL`, `JWT_ISSUER`, `JWT_AUDIENCE`, `SINGLE_PLANE_DEPLOYMENT` | +| `platform-api-env-secret.yaml` | Secret | `platform-api-env` | `api` | `DB_PASSWORD`, `DB_USERNAME`, `DB_HOST`, `DB_NAME`, `DB_DATABASE`, 
`JWT_SECRET_KEY`, `AUTH_STRATEGY` | +| `recommender-env-cm.yaml` | ConfigMap | `recommender-env` | `recommender` | `BLOB_STORAGE_ADAPTER_TYPE`, `ETL_BLOB_CACHE_BUCKET_NAME` | +| `recommender-env-secret.yaml` | Secret | `recommender-env` | `recommender` | `BLOB_STORAGE_ADAPTER_ACCOUNT_NAME`, `BLOB_STORAGE_ADAPTER_ACCOUNT_KEY`, `BLOB_STORAGE_ADAPTER_CONTAINER_REGION` | +| `secret-provider-api-env-cm.yaml` | ConfigMap | `secrets-provider-api-env` | `secrets` | `ENV`, `ENVIRONMENT`, `OTEL_EXPORTER_OTLP_ENDPOINT`, `OTEL_METRICS_EXPORTER`, `OTEL_TRACES_EXPORTER`, `PRIVATE_KEY_SECRETS_ADAPTER_AZURE_REGION`, `PRIVATE_KEY_SECRETS_ADAPTER_TYPE`, `SECRETS_ADAPTER_AZURE_REGION`, `SECRETS_ADAPTER_TYPE` | +| `secret-provider-api-env-secret.yaml` | Secret | `secrets-provider-api-env` | `secrets` | `BLOB_STORAGE_ADAPTER_ACCOUNT_NAME`, `BLOB_STORAGE_ADAPTER_ACCOUNT_KEY`, `BLOB_STORAGE_ADAPTER_CONTAINER_REGION` | +| `usage-collector-env-secret.yaml` | Secret | `usage-collector-env` | `api` | `DB_PASSWORD`, `DB_USERNAME`, `DB_HOST`, `DB_NAME`, `BLOB_STORAGE_ADAPTER_TYPE` | + +For example, for the `data-broker-env-cm.yaml` ConfigMap file, the contents would look like this: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: data-broker-env + namespace: api +data: + JOB_STATUS_BUCKET_NAME: "<your-value>" + JOB_DB_BUCKET_NAME: "<your-value>" + BLOB_STORAGE_ADAPTER_TYPE: "<your-value>" +``` + +The `data-broker-env-secret.yaml` Secret file would look like this: + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: data-broker-env + namespace: api +type: Opaque +stringData: + BLOB_STORAGE_ADAPTER_ACCOUNT_NAME: "<your-value>" + BLOB_STORAGE_ADAPTER_ACCOUNT_KEY: "<your-value>" + BLOB_STORAGE_ADAPTER_CONTAINER_REGION: "<your-value>" +``` \ No newline at end of file diff --git a/self-hosted/azure/overview.mdx b/self-hosted/azure/overview.mdx new file mode 100644 index 00000000..5624a86f --- /dev/null +++ b/self-hosted/azure/overview.mdx @@ -0,0 +1,22 @@ +--- +title: 
Self-hosting Unstructured on Azure +sidebarTitle: Overview +--- + +## Getting started + +To get started with a self-hosted deployment, your organization must first sign a self-hosting agreement with Unstructured. +This agreement outlines the terms and conditions for your organization to use Unstructured in a self-hosted environment. + +To begin the self-hosting agreement process, contact your Unstructured sales representative, email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website. + +After your organization has signed the self-hosting agreement, a member of the Unstructured technical enablement team will reach out to you to begin the +deployment onboarding process. To streamline this process, you are encouraged to begin setting up your target environment as soon as possible. To do this, +see the [onboarding checklist](/self-hosted/azure/onboard). + +## Questions? Need help? + +Contact your Unstructured sales representative, email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website, and a member of the Unstructured sales or support teams +will get back to you as soon as possible. \ No newline at end of file diff --git a/self-hosted/bare-metal/onboard.mdx b/self-hosted/bare-metal/onboard.mdx new file mode 100644 index 00000000..8701c36e --- /dev/null +++ b/self-hosted/bare-metal/onboard.mdx @@ -0,0 +1,25 @@ +--- +title: Bare metal self-hosted onboarding checklist +sidebarTitle: Onboarding +--- + +<Note> + To proceed with a self-hosted deployment, your organization must first sign a self-hosting agreement with Unstructured. 
+ + If you have not yet signed this agreement, stop here, and begin the self-hosting agreement process by emailing + Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or filling out the [contact form](https://unstructured.io/contact) + on the Unstructured website. +</Note> + +After your organization has signed the self-hosting agreement with Unstructured, a member of the Unstructured technical enablement team will reach out to you to begin the +deployment onboarding process. To streamline this process, you are encouraged to begin setting up your target environment as soon as possible. To do this, you +must first set up your hardware as follows. + +## Questions? Need help? + +If you have questions or need help as you go, contact your Unstructured sales representative or technical enablement contact. If you do not know who they are, +email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website, and a member of the Unstructured sales or technical enablement teams +will get back to you as soon as possible. + +## Onboarding checklist \ No newline at end of file diff --git a/self-hosted/bare-metal/overview.mdx b/self-hosted/bare-metal/overview.mdx new file mode 100644 index 00000000..71c0a206 --- /dev/null +++ b/self-hosted/bare-metal/overview.mdx @@ -0,0 +1,22 @@ +--- +title: Self-hosting Unstructured on your own hardware +sidebarTitle: Overview +--- + +## Getting started + +To get started with a self-hosted deployment, your organization must first sign a self-hosting agreement with Unstructured. +This agreement outlines the terms and conditions for your organization to use Unstructured in a self-hosted environment. 
+ +To begin the self-hosting agreement process, email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website. + +After your organization has signed the self-hosting agreement, a member of the Unstructured technical enablement team will reach out to you to begin the +deployment onboarding process. To streamline this process, you are encouraged to begin setting up your target environment as soon as possible. To do this, +see the [onboarding checklist](/self-hosted/bare-metal/onboard). + +## Questions? Need help? + +Email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website, and a member of the Unstructured sales or support teams +will get back to you as soon as possible. \ No newline at end of file diff --git a/self-hosted/gcp/onboard.mdx b/self-hosted/gcp/onboard.mdx new file mode 100644 index 00000000..1f7069b2 --- /dev/null +++ b/self-hosted/gcp/onboard.mdx @@ -0,0 +1,330 @@ +--- +title: GCP self-hosted onboarding checklist +sidebarTitle: Onboarding +--- + +<Note> + To proceed with a self-hosted deployment, your organization must first sign a self-hosting agreement with Unstructured. + + If you have not yet signed this agreement, stop here, and begin the self-hosting agreement process by contacting your Unstructured sales representative, emailing + Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or filling out the [contact form](https://unstructured.io/contact) + on the Unstructured website. +</Note> + +After your organization has signed the self-hosting agreement with Unstructured, a member of the Unstructured technical enablement team will reach out to you to begin the +deployment onboarding process. 
To streamline this process, you are encouraged to begin setting up your target environment as soon as possible. To do this, you +must first set up your GCP account as follows. + +## Questions? Need help? + +If you have questions or need help as you go, contact your Unstructured sales representative or technical enablement contact. If you do not know who they are, +email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website, and a member of the Unstructured sales or technical enablement teams +will get back to you as soon as possible. + +## Onboarding checklist + +Set up the following infrastructure within your GCP account for Unstructured to deploy the Unstructured UI and API into. + +### **VPC and networking (GCP equivalent)** + +- **VPC Network** + + - Name: `u10d-platform` + - Subnet Mode: *Custom* + - CIDR: `10.0.0.0/16` + - DNS: Internal DNS supported by default + +- **Internet Gateway** + + - GCP provides implicit internet access via default internet gateway + + (No need to explicitly create) + +- **Public Subnet** + + - Subnet: `public-subnet` — `10.0.0.0/24` + - Region: `${region}` + - Enable external IPs on VM instances for internet access + +- **NAT Gateway** + + - Use **Cloud NAT** attached to a **Cloud Router** in public subnet + - Needed to provide egress internet access to private subnet instances + +- **Private Subnets (x2)** + + - `private-subnet-a`: `10.0.1.0/24`, region `${region}-a` + - `private-subnet-b`: `10.0.2.0/24`, region `${region}-b` + +- **Routes** + + - Public subnet: default route `0.0.0.0/0` to Internet Gateway (via external IPs) + - Private subnets: route `0.0.0.0/0` via Cloud NAT + +### **IAM roles and policies** + +- **GKE Cluster IAM Service Account** + + - Grant roles: + + - `roles/container.clusterAdmin` + - `roles/compute.networkAdmin` + +- **GKE Node IAM Service Account** + + - Grant roles: + + - 
`roles/container.nodeServiceAccount` + - `roles/compute.viewer` + - `roles/storage.objectViewer` + +- **Workload Identity IAM Bindings (x3)** + + - Namespaces: `recommender`, `etl-operator`, `data-broker` + - Use **Workload Identity Federation** + - Bind GCP IAM Service Accounts to Kubernetes service accounts + - Grant `roles/storage.objectAdmin` for access to GCS buckets + +### **GKE cluster** + +- **Control Plane** + + - Version: `1.31` or higher + - Private Cluster: *Enabled* + - Master Authorized Networks: your IP(s) + - Enable Public Endpoint Access: Yes + +- **Node Pool** + + - Machine type: `n2-standard-16` + - Disk: 100GB, SSD (default boot disk) + - Node count: min 2, max 5, autoscaling enabled + - SSH access: via OS Login + SSH keys + - SSH key: Add public key to instance metadata + +- **Firewall Rules** + + - Allow: + + - Internal: `10.0.0.0/16` + - Egress: all + + - Kubernetes master access to nodes + +### **Kubernetes add-ons (installed via `kubectl` or Helm)** + +- **Workload Identity Config** +- **Metrics Server** + + - Deployed manually (same version: `v0.7.2`) + +- **GCP CSI Driver** + + - Provisioner: `pd.csi.storage.gke.io` + - Role binding needed for controller SA + +### **Storage class** + +```yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: pd-ssd + annotations: + storageclass.kubernetes.io/is-default-class: "true" +provisioner: pd.csi.storage.gke.io +parameters: + type: pd-ssd + encrypted: "true" +volumeBindingMode: WaitForFirstConsumer +``` + +### **Cloud SQL (Postgres)** + +- **Private IP-enabled Cloud SQL instance** + + - Engine: Postgres 16 + - Size: `db-f1-micro` (or `db-custom-1-3840`) + - Storage: 20GB + - Credentials: Username/password + - Private network: Use the private VPC + +- **Cloud SQL Auth Proxy** or private VPC peering to connect from GKE + +### **GCS Buckets** + +- Buckets: + + - 
`u10d-{stack_name}-etl-blob-cache` + - `u10d-{stack_name}-etl-job-db` + - `u10d-{stack_name}-etl-job-status` + - `u10d-{stack_name}-job-files` + +- Config: + + - Versioning: Enabled + - Encryption: Default (Google-managed key or CMEK if needed) + - Lifecycle rule: Auto-delete / force destroy if needed (optional) + +### **Keys** + +- **SSH Key Pair** + + - Generate manually (`ssh-keygen -t rsa -b 4096`) + - Upload public key to project metadata or OS Login + - Export private key as PEM for automation + +### Secrets and ConfigMaps + +After your infrastructure is set up, but before Unstructured can deploy the Unstructured UI and API into your infrastructure, +Unstructured will need to know the values of the following Secrets and configuration mappings (also known as _ConfigMaps_). + +The Secrets are as follows. + +#### **Blob storage credentials** + +- `BLOB_STORAGE_ADAPTER_GCP_SERVICE_ACCOUNT_KEY_JSON` +- `BLOB_STORAGE_ADAPTER_REGION_NAME` + +#### **Database credentials** + +- `DB_USERNAME` +- `DB_PASSWORD` +- `DB_HOST` +- `DB_NAME` +- `DB_DATABASE` (used in `platform-api` only) + +#### **Authentication** + +- `JWT_SECRET_KEY` +- `AUTH_STRATEGY` (sometimes encoded, sometimes not) +- `SESSION_SECRET` +- `SHARED_SECRET` +- `KEYCLOAK_CLIENT_SECRET` +- `KEYCLOAK_ADMIN_SECRET` +- `KEYCLOAK_ADMIN` +- `KEYCLOAK_ADMIN_PASSWORD` +- `API_BEARER_TOKEN` + +The ConfigMaps are as follows. 
+ +#### **Blob storage settings** + +- `BLOB_STORAGE_ADAPTER_TYPE` (always `gcp` for GCP) +- `BLOB_STORAGE_ADAPTER_BUCKET` +- `ETL_BLOB_CACHE_BUCKET_NAME` +- `ETL_API_BLOB_STORAGE_ADAPTER_BUCKET` +- `ETL_API_BLOB_STORAGE_ADAPTER_TYPE` +- `ETL_API_DB_REMOTE_BUCKET_NAME` +- `ETL_API_JOB_STATUS_DEST_BUCKET_NAME` +- `JOB_STATUS_BUCKET_NAME` +- `JOB_DB_BUCKET_NAME` + +#### **Environment** + +- `ENV` +- `ENVIRONMENT` +- `JOB_ENV` +- `JOB_ENVIRONMENT` + +#### **Observability and OpenTelemetry (OTel)** + +- `JOB_OTEL_EXPORTER_OTLP_ENDPOINT` +- `JOB_OTEL_METRICS_EXPORTER` +- `JOB_OTEL_TRACES_EXPORTER` +- `OTEL_EXPORTER_OTLP_ENDPOINT` +- `OTEL_METRICS_EXPORTER` +- `OTEL_TRACES_EXPORTER` + +#### **Unstructured API and authentication** + +- `UNSTRUCTURED_API_URL` +- `JWKS_URL` +- `JWT_ISSUER` +- `JWT_AUDIENCE` +- `SINGLE_PLANE_DEPLOYMENT` + +#### **Front end and dashboard** + +- `API_BASE_URL` +- `API_CLIENT_BASE_URL` +- `API_URL` +- `APM_SERVICE_NAME` +- `APM_SERVICE_NAME_CLIENT` +- `AUTH_STRATEGY` +- `FRONTEND_BASE_URL` +- `KEYCLOAK_CALLBACK_URL` +- `KEYCLOAK_CLIENT_ID` +- `KEYCLOAK_DOMAIN` +- `KEYCLOAK_REALM` +- `KEYCLOAK_SSL_ENABLED` +- `KEYCLOAK_TRUST_ISSUER` +- `PUBLIC_BASE_URL` +- `PUBLIC_RELEASE_CHANNEL` + +#### **Sentry & Feature Flags** + +- `SENTRY_DSN` +- `SENTRY_SAMPLE_RATE` +- `WORKFLOW_NODE_EDITOR_FF_REQUEST_FORM` +- `CUSTOM_WORKFLOW_FF_REQUEST_FORM` + +#### **Redis** + +- `REDIS_DSN` + +#### **Other** + +- `IMAGE_PULL_SECRETS` +- `PRIVATE_KEY_SECRETS_ADAPTER_TYPE` +- `PRIVATE_KEY_SECRETS_ADAPTER_GCP_REGION` +- `SECRETS_ADAPTER_TYPE` +- `SECRETS_ADAPTER_GCP_REGION` + +The preceding Secrets and ConfigMaps must be added to the following files: + +| File name | Type | Resource name | Namespace | Data keys +| --- | --- | --- | --- | --- | +| `data-broker-env-cm.yaml` | ConfigMap | `data-broker-env` | `api` | `JOB_STATUS_BUCKET_NAME`, `JOB_DB_BUCKET_NAME`, `BLOB_STORAGE_ADAPTER_TYPE` | +| `data-broker-env-secret.yaml` | Secret | `data-broker-env` | `api` | 
`BLOB_STORAGE_ADAPTER_GCP_SERVICE_ACCOUNT_KEY_JSON`, `BLOB_STORAGE_ADAPTER_REGION_NAME` | +| `dataplane-api-env-cm.yaml` | Secret | `dataplane-api-env` | `api` | `DB_PASSWORD`, `DB_USERNAME`, `DB_HOST`, `DB_NAME` | +| `etl-operator-env-cm.yaml` | ConfigMap | `etl-operator-env` | `etl-operator` | `BLOB_STORAGE_ADAPTER_BUCKET`, `JOB_STATUS_BUCKET_NAME`, `JOB_DB_BUCKET_NAME`, `BLOB_STORAGE_ADAPTER_TYPE`, `ENV`, `ENVIRONMENT`, `REDIS_DSN`, `ETL_API_BLOB_STORAGE_ADAPTER_BUCKET`, `ETL_API_BLOB_STORAGE_ADAPTER_TYPE`, `ETL_API_DB_REMOTE_BUCKET_NAME`, `ETL_API_JOB_STATUS_DEST_BUCKET_NAME` (x2), `ETL_BLOB_CACHE_BUCKET_NAME`, `IMAGE_PULL_SECRETS`, `JOB_ENV`, `JOB_ENVIRONMENT`, `JOB_OTEL_EXPORTER_OTLP_ENDPOINT`, `JOB_OTEL_METRICS_EXPORTER`, `JOB_OTEL_TRACES_EXPORTER`, `OTEL_EXPORTER_OTLP_ENDPOINT`, `OTEL_METRICS_EXPORTER`, `OTEL_TRACES_EXPORTER`, `UNSTRUCTURED_API_URL` | +| `etl-operator-env-secret.yaml` | Secret | `etl-operator-env` | `etl-operator` | `BLOB_STORAGE_ADAPTER_GCP_SERVICE_ACCOUNT_KEY_JSON`, `BLOB_STORAGE_ADAPTER_REGION_NAME` | +| `frontend-env-cm.yaml` | ConfigMap | `frontend-env` | `www` | `API_BASE_URL`, `API_CLIENT_BASE_URL`, `API_URL`, `APM_SERVICE_NAME`, `APM_SERVICE_NAME_CLIENT`, `AUTH_STRATEGY`, `ENV`, `FRONTEND_BASE_URL`, `KEYCLOAK_CALLBACK_URL`, `KEYCLOAK_CLIENT_ID`, `KEYCLOAK_DOMAIN`, `KEYCLOAK_REALM`, `KEYCLOAK_SSL_ENABLED`, `KEYCLOAK_TRUST_ISSUER`, `PUBLIC_BASE_URL`, `PUBLIC_RELEASE_CHANNEL`, `SENTRY_DSN`, `SENTRY_SAMPLE_RATE`, `WORKFLOW_NODE_EDITOR_FF_REQUEST_FORM`, `CUSTOM_WORKFLOW_FF_REQUEST_FORM` | +| `frontend-env-secret.yaml` | Secret | `frontend-env` | `www` | `API_BEARER_TOKEN`, `KEYCLOAK_ADMIN_SECRET`, `KEYCLOAK_CLIENT_SECRET`, `SESSION_SECRET`, `SHARED_SECRET` | +| `keycloak-secret.yaml` | Secret | `phasetwo-keycloak-env` | `www` | `KEYCLOAK_ADMIN`, `KEYCLOAK_ADMIN_PASSWORD` | +| `platform-api-env-cm.yaml` | ConfigMap | `platform-api-env` | `api` | `JWKS_URL`, `JWT_ISSUER`, `JWT_AUDIENCE`, `SINGLE_PLANE_DEPLOYMENT` | +| 
`platform-api-env-secret.yaml` | Secret | `platform-api-env` | `api` | `DB_PASSWORD`, `DB_USERNAME`, `DB_HOST`, `DB_NAME`, `DB_DATABASE`, `JWT_SECRET_KEY`, `AUTH_STRATEGY` | +| `recommender-env-cm.yaml` | ConfigMap | `recommender-env` | `recommender` | `BLOB_STORAGE_ADAPTER_TYPE`, `ETL_BLOB_CACHE_BUCKET_NAME` | +| `recommender-env-secret.yaml` | Secret | `recommender-env` | `recommender` | `BLOB_STORAGE_ADAPTER_GCP_SERVICE_ACCOUNT_KEY_JSON`, `BLOB_STORAGE_ADAPTER_REGION_NAME` | +| `secret-provider-api-env-cm.yaml` | ConfigMap | `secrets-provider-api-env` | `secrets` | `ENV`, `ENVIRONMENT`, `OTEL_EXPORTER_OTLP_ENDPOINT`, `OTEL_METRICS_EXPORTER`, `OTEL_TRACES_EXPORTER`, `PRIVATE_KEY_SECRETS_ADAPTER_GCP_REGION`, `PRIVATE_KEY_SECRETS_ADAPTER_TYPE`, `SECRETS_ADAPTER_GCP_REGION`, `SECRETS_ADAPTER_TYPE` | +| `secret-provider-api-env-secret.yaml` | Secret | `secrets-provider-api-env` | `secrets` | `BLOB_STORAGE_ADAPTER_GCP_SERVICE_ACCOUNT_KEY_JSON`, `BLOB_STORAGE_ADAPTER_REGION_NAME` | +| `usage-collector-env-secret.yaml` | Secret | `usage-collector-env` | `api` | `DB_PASSWORD`, `DB_USERNAME`, `DB_HOST`, `DB_NAME`, `BLOB_STORAGE_ADAPTER_TYPE` | + +For example, for the `data-broker-env-cm.yaml` [ConfigMap](https://kubernetes.io/docs/concepts/configuration/configmap/) file, the contents would look like this: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: data-broker-env + namespace: api +data: + JOB_STATUS_BUCKET_NAME: "<your-value>" + JOB_DB_BUCKET_NAME: "<your-value>" + BLOB_STORAGE_ADAPTER_TYPE: "<your-value>" +``` + +For the `data-broker-env-secret.yaml` [Secret](https://kubernetes.io/docs/concepts/configuration/secret/) file, the contents would look like this: + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: data-broker-env + namespace: api +type: Opaque +stringData: + BLOB_STORAGE_ADAPTER_GCP_SERVICE_ACCOUNT_KEY_JSON: "<your-value>" + BLOB_STORAGE_ADAPTER_REGION_NAME: "<your-value>" \ No newline at end of file diff --git 
a/self-hosted/gcp/overview.mdx b/self-hosted/gcp/overview.mdx new file mode 100644 index 00000000..6463dbb7 --- /dev/null +++ b/self-hosted/gcp/overview.mdx @@ -0,0 +1,22 @@ +--- +title: Self-hosting Unstructured on Google Cloud Platform (GCP) +sidebarTitle: Overview +--- + +## Getting started + +To get started with a self-hosted deployment, your organization must first sign a self-hosting agreement with Unstructured. +This agreement outlines the terms and conditions for your organization to use Unstructured in a self-hosted environment. + +To begin the self-hosting agreement process, contact your Unstructured sales representative, email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website. + +After your organization has signed the self-hosting agreement, a member of the Unstructured technical enablement team will reach out to you to begin the +deployment onboarding process. To streamline this process, you are encouraged to begin setting up your target environment as soon as possible. To do this, +see the [onboarding checklist](/self-hosted/gcp/onboard). + +## Questions? Need help? + +Contact your Unstructured sales representative, email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website, and a member of the Unstructured sales or support teams +will get back to you as soon as possible. 
\ No newline at end of file diff --git a/self-hosted/overview.mdx b/self-hosted/overview.mdx new file mode 100644 index 00000000..fcf1197b --- /dev/null +++ b/self-hosted/overview.mdx @@ -0,0 +1,36 @@ +--- +title: Self-hosting Unstructured +sidebarTitle: Overview +--- + +Unstructured offers _self-hosted_ deployments, which allow you to run the [Unstructured user interface](/ui/overview) (UI) +and the [Unstructured API](/api-reference/overview) on infrastructure that you maintain in your +[Amazon Web Services (AWS)](/self-hosted/aws/overview), [Azure](/self-hosted/azure/overview), or +[Google Cloud Platform (GCP)](/self-hosted/gcp/overview) account. + +Running Unstructured on your own cloud-based infrastructure provides benefits such as the following: + +- _Security, privacy, and ownership_: Your organization might have strict data security requirements to keep your data and models within a virtual private cloud (VPC). +- _Compliance and data sovereignty_: Certain industries and locales might have regulations that require data to be processed, and data and models to be stored, in specific cloud provider regions. +- _Customization_: Self-hosted deployments allow for more customization and control options over your environments. +- _Testing and development_: Self-hosted deployments can be useful for multi-environment testing and development purposes, allowing you to experiment with Unstructured in a testing environment without affecting your production environment. + +## Getting started + +To get started with a self-hosted deployment, your organization must first sign a self-hosting agreement with Unstructured. +This agreement outlines the terms and conditions for your organization to use Unstructured in a self-hosted environment. 
+ +To begin the self-hosting agreement process, contact your Unstructured sales representative, email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website. + +After your organization has signed the self-hosting agreement, a member of the Unstructured technical enablement team will reach out to you to begin the +deployment onboarding process. To streamline this process, you are encouraged to begin setting up your target environment as soon as possible. To do this, +see the onboarding requirements for your [AWS](/self-hosted/aws/onboard), [Azure](/self-hosted/azure/onboard), or +[GCP](/self-hosted/gcp/onboard) account. + +## Questions? Need help? + +Contact your Unstructured sales representative, email Unstructured Sales at [sales@unstructured.io](mailto:sales@unstructured.io), or fill out the +[contact form](https://unstructured.io/contact) on the Unstructured website, and a member of the Unstructured sales or support teams +will get back to you as soon as possible. + diff --git a/self-hosted/security-compliance/overview.mdx b/self-hosted/security-compliance/overview.mdx new file mode 100644 index 00000000..1ee41e5f --- /dev/null +++ b/self-hosted/security-compliance/overview.mdx @@ -0,0 +1,75 @@ +--- +title: Security and compliance overview +sidebarTitle: Overview +--- + +This document outlines the security features and compliance posture of the software system. +It is intended to provide customers and stakeholders with a clear understanding of the security +mechanisms in place and the standards to which the system adheres. + +## Encryption + +**In Transit** + +- All inter-service communication within the Kubernetes cluster is encrypted using mutual TLS (mTLS) via Istio service mesh. +- All ingress and egress communication uses TLS 1.2 or higher. 
+ +**At Rest** + +- Data stored in blob storage is encrypted at rest using cloud-native encryption mechanisms (e.g., Azure Storage Service Encryption, AWS S3 Server-Side Encryption). +- Encryption keys are managed by the underlying cloud provider and rotated according to provider best practices. + +## Secrets management + +- Secrets are securely managed using the CSI (Container Storage Interface) driver with support for: + + - Azure Key Vault + - AWS Secrets Manager + - Google Secret Manager + +- Secrets are mounted into pods at runtime and are never stored in plaintext in the cluster. +- Access to secrets is controlled through fine-grained identity and access management (IAM) policies, tied to the workload identity. +- Rotation of secrets is handled externally and reflected immediately via CSI mounts. + +## Authentication and authorization + +- End-user authentication is managed through Keycloak, which supports integration with external identity providers such as Azure Active Directory and Google Identity. +- Authentication follows the OAuth2 and OpenID Connect standards. +- Role-based access control (RBAC) is applied across the system. +- Roles and Permissions: + + - Super Admin + - General Developer + - Billing + +## CI/CD and software supply chain security + +- Continuous integration and delivery (CI/CD) pipelines include: + + - Static code analysis + - Automated testing + - Software Bill of Materials (SBOM) creation + - Vulnerability scanning using Grype + +- All container images are built from hardened, minimal base images and scanned prior to release. + +## Logging and monitoring + +- Logs are emitted to standard output and error streams (`stdout`/`stderr`) following container logging best practices. +- Logs can be collected by pluggable logging agents (e.g., Elastic Agent) deployed into the Kubernetes cluster. +- Centralized logging enables customers to integrate with their own observability solutions and meet internal audit requirements. 
+- Metrics are emitted to an OpenTelemetry (OTel) Collector, which can be configured to ship to various observability solutions. + +## Compliance + +The system is designed and operated in accordance with industry-recognized security and data protection standards: + +- **SOC 2 Type II**: Controls are implemented and audited for security, availability, and confidentiality. +- **ISO 27001**: Operational processes align with the ISO 27001 framework for information security management. +- **GDPR**: Data handling practices conform to the General Data Protection Regulation (GDPR) for protection of personal data. +- **FedRAMP**: The system meets the security requirements of the Federal Risk and Authorization Management Program (FedRAMP) for U.S. government systems. + +## Additional protections + +- Each workflow job runs in an isolated, ephemeral Kubernetes namespace to ensure strong runtime separation. +- Services are isolated by namespace within the Kubernetes cluster. \ No newline at end of file