google/generativeai/embedding.py

# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import itertools
from typing import Any, Iterable, overload, TypeVar, Union, Mapping

import google.ai.generativelanguage as glm
from google.generativeai import protos

from google.generativeai.client import get_default_generative_client
from google.generativeai.client import get_default_generative_async_client

from google.generativeai.types import helper_types
from google.generativeai.types import model_types
from google.generativeai.types import text_types
from google.generativeai.types import content_types

DEFAULT_EMB_MODEL = "models/embedding-001"
EMBEDDING_MAX_BATCH_SIZE = 100

EmbeddingTaskType = protos.TaskType

EmbeddingTaskTypeOptions = Union[int, str, EmbeddingTaskType]

_EMBEDDING_TASK_TYPE: dict[EmbeddingTaskTypeOptions, EmbeddingTaskType] = {
    EmbeddingTaskType.TASK_TYPE_UNSPECIFIED: EmbeddingTaskType.TASK_TYPE_UNSPECIFIED,
    0: EmbeddingTaskType.TASK_TYPE_UNSPECIFIED,
    "task_type_unspecified": EmbeddingTaskType.TASK_TYPE_UNSPECIFIED,
    "unspecified": EmbeddingTaskType.TASK_TYPE_UNSPECIFIED,
    EmbeddingTaskType.RETRIEVAL_QUERY: EmbeddingTaskType.RETRIEVAL_QUERY,
    1: EmbeddingTaskType.RETRIEVAL_QUERY,
    "retrieval_query": EmbeddingTaskType.RETRIEVAL_QUERY,
    "query": EmbeddingTaskType.RETRIEVAL_QUERY,
    EmbeddingTaskType.RETRIEVAL_DOCUMENT: EmbeddingTaskType.RETRIEVAL_DOCUMENT,
    2: EmbeddingTaskType.RETRIEVAL_DOCUMENT,
    "retrieval_document": EmbeddingTaskType.RETRIEVAL_DOCUMENT,
    "document": EmbeddingTaskType.RETRIEVAL_DOCUMENT,
    EmbeddingTaskType.SEMANTIC_SIMILARITY: EmbeddingTaskType.SEMANTIC_SIMILARITY,
    3: EmbeddingTaskType.SEMANTIC_SIMILARITY,
    "semantic_similarity": EmbeddingTaskType.SEMANTIC_SIMILARITY,
    "similarity": EmbeddingTaskType.SEMANTIC_SIMILARITY,
    EmbeddingTaskType.CLASSIFICATION: EmbeddingTaskType.CLASSIFICATION,
    4: EmbeddingTaskType.CLASSIFICATION,
    "classification": EmbeddingTaskType.CLASSIFICATION,
    EmbeddingTaskType.CLUSTERING: EmbeddingTaskType.CLUSTERING,
    5: EmbeddingTaskType.CLUSTERING,
    "clustering": EmbeddingTaskType.CLUSTERING,
    6: EmbeddingTaskType.QUESTION_ANSWERING,
    "question_answering": EmbeddingTaskType.QUESTION_ANSWERING,
    "qa": EmbeddingTaskType.QUESTION_ANSWERING,
    EmbeddingTaskType.QUESTION_ANSWERING: EmbeddingTaskType.QUESTION_ANSWERING,
    7: EmbeddingTaskType.FACT_VERIFICATION,
    "fact_verification": EmbeddingTaskType.FACT_VERIFICATION,
    "verification": EmbeddingTaskType.FACT_VERIFICATION,
    EmbeddingTaskType.FACT_VERIFICATION: EmbeddingTaskType.FACT_VERIFICATION,
}


def to_task_type(x: EmbeddingTaskTypeOptions) -> EmbeddingTaskType:
    if isinstance(x, str):
        x = x.lower()
    return _EMBEDDING_TASK_TYPE[x]


try:
    # python 3.12+
    _batched = itertools.batched  # type: ignore
except AttributeError:
    T = TypeVar("T")

    def _batched(iterable: Iterable[T], n: int) -> Iterable[list[T]]:
        if n < 1:
            raise ValueError(
                f"Invalid input: The batch size 'n' must be a positive integer. You entered: {n}. Please enter a number greater than 0."
            )
        batch = []
        for item in iterable:
            batch.append(item)
            if len(batch) == n:
                yield batch
                batch = []

        if batch:
            yield batch


@overload
def embed_content(
    model: model_types.BaseModelNameOptions,
    content: content_types.ContentType,
    task_type: EmbeddingTaskTypeOptions | None = None,
    title: str | None = None,
    output_dimensionality: int | None = None,
    client: glm.GenerativeServiceClient | None = None,
    request_options: helper_types.RequestOptionsType | None = None,
) -> text_types.EmbeddingDict: ...


@overload
def embed_content(
    model: model_types.BaseModelNameOptions,
    content: Iterable[content_types.ContentType],
    task_type: EmbeddingTaskTypeOptions | None = None,
    title: str | None = None,
    output_dimensionality: int | None = None,
    client: glm.GenerativeServiceClient | None = None,
    request_options: helper_types.RequestOptionsType | None = None,
) -> text_types.BatchEmbeddingDict: ...


def embed_content(
    model: model_types.BaseModelNameOptions,
    content: content_types.ContentType | Iterable[content_types.ContentType],
    task_type: EmbeddingTaskTypeOptions | None = None,
    title: str | None = None,
    output_dimensionality: int | None = None,
    client: glm.GenerativeServiceClient = None,
    request_options: helper_types.RequestOptionsType | None = None,
) -> text_types.EmbeddingDict | text_types.BatchEmbeddingDict:
    """Calls the API to create embeddings for content passed in.

    Args:
        model:
            Which [model](https://ai.google.dev/models/gemini#embedding) to
            call, as a string or a `types.Model`.

        content:
            Content to embed.

        task_type:
            Optional task type for which the embeddings will be used. Can only
            be set for `models/embedding-001`.

        title:
            An optional title for the text. Only applicable when task_type is
            `RETRIEVAL_DOCUMENT`.

        output_dimensionality:
            Optional reduced dimensionality for the output embeddings. If set,
            excessive values from the output embeddings will be truncated from
            the end.

        request_options:
            Options for the request.

    Return:
        Dictionary containing the embedding (list of float values) for the
        input content.
    """
    model = model_types.make_model_name(model)

    if request_options is None:
        request_options = {}

    if client is None:
        client = get_default_generative_client()

    if title and to_task_type(task_type) is not EmbeddingTaskType.RETRIEVAL_DOCUMENT:
        raise ValueError(
            f"Invalid task type: When a title is specified, the task must be of a 'retrieval document' type. Received task type: {task_type} and title: {title}."
        )

    if output_dimensionality and output_dimensionality < 0:
        raise ValueError(
            f"Invalid value: `output_dimensionality` must be a non-negative integer. Received: {output_dimensionality}."
        )

    if task_type:
        task_type = to_task_type(task_type)

    if isinstance(content, Iterable) and not isinstance(content, (str, Mapping)):
        result = {"embedding": []}
        requests = (
            protos.EmbedContentRequest(
                model=model,
                content=content_types.to_content(c),
                task_type=task_type,
                title=title,
                output_dimensionality=output_dimensionality,
            )
            for c in content
        )
        for batch in _batched(requests, EMBEDDING_MAX_BATCH_SIZE):
            embedding_request = protos.BatchEmbedContentsRequest(model=model, requests=batch)
            embedding_response = client.batch_embed_contents(
                embedding_request,
                **request_options,
            )
            embedding_dict = type(embedding_response).to_dict(embedding_response)
            result["embedding"].extend(e["values"] for e in embedding_dict["embeddings"])
        return result
    else:
        embedding_request = protos.EmbedContentRequest(
            model=model,
            content=content_types.to_content(content),
            task_type=task_type,
            title=title,
            output_dimensionality=output_dimensionality,
        )
        embedding_response = client.embed_content(
            embedding_request,
            **request_options,
        )
        embedding_dict = type(embedding_response).to_dict(embedding_response)
        embedding_dict["embedding"] = embedding_dict["embedding"]["values"]
        return embedding_dict


@overload
async def embed_content_async(
    model: model_types.BaseModelNameOptions,
    content: content_types.ContentType,
    task_type: EmbeddingTaskTypeOptions | None = None,
    title: str | None = None,
    output_dimensionality: int | None = None,
    client: glm.GenerativeServiceAsyncClient | None = None,
    request_options: helper_types.RequestOptionsType | None = None,
) -> text_types.EmbeddingDict: ...


@overload
async def embed_content_async(
    model: model_types.BaseModelNameOptions,
    content: Iterable[content_types.ContentType],
    task_type: EmbeddingTaskTypeOptions | None = None,
    title: str | None = None,
    output_dimensionality: int | None = None,
    client: glm.GenerativeServiceAsyncClient | None = None,
    request_options: helper_types.RequestOptionsType | None = None,
) -> text_types.BatchEmbeddingDict: ...


async def embed_content_async(
    model: model_types.BaseModelNameOptions,
    content: content_types.ContentType | Iterable[content_types.ContentType],
    task_type: EmbeddingTaskTypeOptions | None = None,
    title: str | None = None,
    output_dimensionality: int | None = None,
    client: glm.GenerativeServiceAsyncClient = None,
    request_options: helper_types.RequestOptionsType | None = None,
) -> text_types.EmbeddingDict | text_types.BatchEmbeddingDict:
    """Calls the API to create async embeddings for content passed in."""

    model = model_types.make_model_name(model)

    if request_options is None:
        request_options = {}

    if client is None:
        client = get_default_generative_async_client()

    if title and to_task_type(task_type) is not EmbeddingTaskType.RETRIEVAL_DOCUMENT:
        raise ValueError(
            f"Invalid task type: When a title is specified, the task must be of a 'retrieval document' type. Received task type: {task_type} and title: {title}."
        )
    if output_dimensionality and output_dimensionality < 0:
        raise ValueError(
            f"Invalid value: `output_dimensionality` must be a non-negative integer. Received: {output_dimensionality}."
        )

    if task_type:
        task_type = to_task_type(task_type)

    if isinstance(content, Iterable) and not isinstance(content, (str, Mapping)):
        result = {"embedding": []}
        requests = (
            protos.EmbedContentRequest(
                model=model,
                content=content_types.to_content(c),
                task_type=task_type,
                title=title,
                output_dimensionality=output_dimensionality,
            )
            for c in content
        )
        for batch in _batched(requests, EMBEDDING_MAX_BATCH_SIZE):
            embedding_request = protos.BatchEmbedContentsRequest(model=model, requests=batch)
            embedding_response = await client.batch_embed_contents(
                embedding_request,
                **request_options,
            )
            embedding_dict = type(embedding_response).to_dict(embedding_response)
            result["embedding"].extend(e["values"] for e in embedding_dict["embeddings"])
        return result
    else:
        embedding_request = protos.EmbedContentRequest(
            model=model,
            content=content_types.to_content(content),
            task_type=task_type,
            title=title,
            output_dimensionality=output_dimensionality,
        )
        embedding_response = await client.embed_content(
            embedding_request,
            **request_options,
        )
        embedding_dict = type(embedding_response).to_dict(embedding_response)
        embedding_dict["embedding"] = embedding_dict["embedding"]["values"]
        return embedding_dict