Unstructured

Document parsing and data extraction API

docs.unstructured.io/api-reference ↗
Version
0.0.1
OpenAPI
3.0.3
Endpoints
1
Schemas
4
Updated
3 days ago
AI · ai · document-processing · extraction
Use this API in your AI agent

Query structured spec data via REST or MCP. Get exactly what your agent needs.

Get API Key

Server URLs

https://api.unstructured.io
http://localhost:8000

Authentication

apiKey

No endpoints found for this provider.

Schemas

array Elements
{
  "type": "array",
  "items": {
    "Element": {
      "type": "object",
      "properties": {
        "text": {},
        "type": {},
        "metadata": {},
        "element_id": {}
      }
    }
  }
}
object HTTPValidationError
{
  "type": "object",
  "title": "HTTPValidationError",
  "properties": {
    "detail": {
      "type": "array",
      "items": {
        "$ref": "#/components/schemas/ValidationError"
      },
      "title": "Detail"
    }
  }
}
object ValidationError
{
  "type": "object",
  "title": "ValidationError",
  "required": [
    "loc",
    "msg",
    "type"
  ],
  "properties": {
    "loc": {
      "type": "array",
      "items": {
        "oneOf": [
          {
            "type": "string"
          },
          {
            "type": "integer"
          }
        ]
      },
      "title": "Location"
    },
    "msg": {
      "type": "string",
      "title": "Message"
    },
    "type": {
      "type": "string",
      "title": "Error Type"
    }
  }
}
object partition_parameters
{
  "type": "object",
  "title": "Partition Parameters",
  "properties": {
    "files": {
      "type": "string",
      "format": "binary",
      "example": {
        "summary": "File to be partitioned",
        "externalValue": "https://github.com/Unstructured-IO/unstructured/blob/98d3541909f64290b5efb65a226fc3ee8a7cc5ee/example-docs/layout-parser-paper.pdf"
      },
      "required": "true",
      "description": "The file to extract"
    },
    "overlap": {
      "type": "integer",
      "title": "Intra-chunk overlap",
      "example": 25,
      "description": "A prefix of this many trailing characters from the prior text-split chunk is applied to second and later chunks formed from oversized elements by text-splitting. Default: None"
    },
    "encoding": {
      "type": "string",
      "title": "Encoding",
      "example": "utf-8",
      "description": "The encoding method used to decode the text input. Default: utf-8"
    },
    "strategy": {
      "type": "string",
      "title": "Strategy",
      "example": "hi_res",
      "description": "The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto"
    },
    "languages": {
      "type": "array",
      "items": {
        "type": "string",
        "example": "eng"
      },
      "title": "OCR Languages",
      "default": [],
      "example": "[eng]",
      "description": "The languages present in the document, for use in partitioning and/or OCR"
    },
    "coordinates": {
      "type": "boolean",
      "title": "Coordinates",
      "description": "If true, return coordinates for each element. Default: false"
    },
    "overlap_all": {
      "type": "boolean",
      "title": "Inter-chunk overlap",
      "description": "When True, overlap is also applied to 'normal' chunks formed by combining whole elements. Use with caution as this can introduce noise into otherwise clean semantic units. Default: None"
    },
    "output_format": {
      "type": "string",
      "title": "Output Format",
      "example": "application/json",
      "description": "The format of the response. Supported formats are application/json and text/csv. Default: application/json."
    },
    "xml_keep_tags": {
      "type": "boolean",
      "title": "Xml Keep Tags",
      "description": "If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to partition_xml."
    },
    "max_characters": {
      "type": "integer",
      "title": "Max Characters",
      "example": 1500,
      "description": "If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 500"
    },
    "chunking_strategy": {
      "type": "string",
      "title": "Chunking Strategy",
      "example": "by_title",
      "description": "Use one of the supported strategies to chunk the returned elements. Currently supports: by_title"
    },
    "hi_res_model_name": {
      "type": "string",
      "title": "Hi Res Model Name",
      "example": "yolox",
      "description": "The name of the inference model used when strategy is hi_res"
    },
    "new_after_n_chars": {
      "type": "integer",
      "title": "New after n chars",
      "example": 1500,
      "description": "If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: max_characters (off)"
    },
    "multipage_sections": {
      "type": "boolean",
      "title": "Multipage Sections",
      "description": "If chunking strategy is set, determines if sections can span multiple pages. Only applies to by_title chunking strategy. Default: true"
    },
    "unique_element_ids": {
      "type": "boolean",
      "title": "Unique element IDs",
      "description": "When True, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in a database). Otherwise a SHA-256 hash of the element text is used. Default: False"
    },
    "include_page_breaks": {
      "type": "boolean",
      "title": "Include Page Breaks",
      "description": "If True, the output will include page breaks if the filetype supports it. Default: false"
    },
    "combine_under_n_chars": {
      "type": "integer",
      "title": "Combine Under N Chars",
      "example": 500,
      "description": "If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: max_characters"
    },
    "include_orig_elements": {
      "type": "boolean",
      "title": "Original-elements flag",
      "description": "When True (the default), the elements used to form a chunk appear in `.metadata.orig_elements` for that chunk. Only applies when chunking is specified using the `chunking_strategy` argument."
    },
    "skip_infer_table_types": {
      "type": "array",
      "items": {
        "type": "string",
        "example": "pdf"
      },
      "title": "Skip Infer Table Types",
      "description": "The document types for which table extraction should be skipped. Default: []"
    },
    "extract_image_block_types": {
      "type": "array",
      "items": {
        "type": "string",
        "example": "image"
      },
      "title": "Image block types to extract",
      "default": [],
      "example": [
        "image",
        "table"
      ],
      "description": "The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields"
    },
    "pdf_infer_table_structure": {
      "type": "boolean",
      "title": "Pdf Infer Table Structure",
      "description": "Deprecated! Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents."
    },
    "gz_uncompressed_content_type": {
      "type": "string",
      "title": "Uncompressed Content Type",
      "example": "application/pdf",
      "description": "If the file is gzipped, use this content type after unzipping"
    }
  }
}