Citation Guide

Guide: Implementing Citations with Perplexity and GCP Gemini Models

This guide demonstrates how to implement and use citations with both Perplexity and GCP Gemini models through GIP.
We'll explore key features, implementation patterns, and use cases for each system.

Overview of Citation Systems

Perplexity Citations

A simple URL-based citation system that returns source links for the generated content.

GCP Gemini Grounding

A comprehensive grounding system that provides detailed metadata about sources and their relevance to specific parts of the response.

Environment Setup

Before calling either provider through GIP, set the required environment variables:

export GIP_URL=https://dev-api.platform.a15t.com/v1
export GIP_API_KEY=sk-gapk-...
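
Both variables are read by every example below. A minimal sketch that fails fast when either is missing:

import os

# Fail fast if the GIP endpoint or API key is not configured
gip_url = os.environ.get("GIP_URL")
gip_api_key = os.environ.get("GIP_API_KEY")
if not gip_url or not gip_api_key:
    raise RuntimeError("Set GIP_URL and GIP_API_KEY before running the examples.")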

Implementation Examples

1. Perplexity Citation Implementation

Basic Response (Non-streaming)

import asyncio
import os
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageParam

async def request_pplx(client: AsyncOpenAI, query: str):
    model = "perplexity/sonar"
    messages: list[ChatCompletionMessageParam] = [{
        "role": "system",
        "content": """You are a helpful AI assistant.

Rules:
1. Provide only the final answer. Do not include any explanation of the steps below.
2. Do not show any intermediate step information.

Steps:
1. Decide if the answer should be a brief sentence or a list of suggestions.
2. If it is a list of suggestions, first write a brief and natural introduction based on the original query.
3. Then write the list of suggestions, separating each suggestion with two newlines."""
    },
    {
        "role": "user",
        "content": query
    }]

    # Standard (non-streaming) chat completion request through GIP
    response = await client.chat.completions.create(
        model=model,
        messages=messages,
    )

    print(response.choices[0].message.content)

if __name__ == "__main__":
    client = AsyncOpenAI(
        base_url=os.getenv("GIP_URL", ""),
        api_key=os.getenv("GIP_API_KEY", ""),
    )
    asyncio.run(request_pplx(client, "What are some good books to read?"))

Streaming Response

import asyncio
import os
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageParam

async def request_pplx(client: AsyncOpenAI, query: str):
    model = "perplexity/sonar"
    messages: list[ChatCompletionMessageParam] = [{
        "role": "system",
        "content": """You are a helpful AI assistant.

Rules:
1. Provide only the final answer. Do not include any explanation of the steps below.
2. Do not show any intermediate step information.

Steps:
1. Decide if the answer should be a brief sentence or a list of suggestions.
2. If it is a list of suggestions, first write a brief and natural introduction based on the original query.
3. Then write the list of suggestions, separating each suggestion with two newlines."""
    },
    {
        "role": "user",
        "content": query
    }]

    stream = await client.chat.completions.create(
        model=model,
        messages=messages,
        stream=True
    )

    async for chunk in stream:
        if chunk.choices[0].delta.content:
            # Print each delta as it arrives, without inserting extra newlines
            print(chunk.choices[0].delta.content, end="", flush=True)

if __name__ == "__main__":
    client = AsyncOpenAI(
        base_url=os.getenv("GIP_URL", ""),
        api_key=os.getenv("GIP_API_KEY", ""),
    )
    asyncio.run(request_pplx(client, "What are some good books to read?"))
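
If you need the complete answer after streaming (for example, to post-process citation markers), accumulate the deltas as they arrive. A minimal sketch, reusing the stream from the example above:

parts: list[str] = []
async for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
        parts.append(delta)
full_text = "".join(parts)  # the complete message once the stream ends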

2. GCP Gemini Grounding Implementation

Basic setup and usage (Gemini < 2.0):

import os
from openai import OpenAI

# Synchronous client configured against GIP, matching the exports above
client = OpenAI(
    base_url=os.getenv("GIP_URL", ""),
    api_key=os.getenv("GIP_API_KEY", ""),
)

def get_vertex_response(query: str):
    response = client.chat.completions.create(
        model="gcp/gemini-1.5-pro-002",
        messages=[{"role": "user", "content": query}],
        extra_body={
            "model_extensions": {
                "provider": "gcp",
                "google_search_retrieval": {
                    "dynamic_retrieval_config": {
                        "dynamic_threshold": 0.7
                    }
                }
            }
        }
    )

    content = response.choices[0].message.content
    # Grounding metadata is returned under "model_extensions" in the response body
    grounding = (response.model_extra or {}).get("model_extensions", {}).get("grounding_metadata", {})

    return content, grounding

Example response structure (Gemini < 2.0):

{
    "content": "Answer about quantum computing...",
    "grounding_metadata": {
        "grounding_chunks": [
            {
                "web": {
                    "title": "Research Article",
                    "uri": "https://source.com"
                }
            }
        ],
        "grounding_supports": [
            {
                "grounding_chunk_indices": [0],
                "confidence_scores": [0.95],
                "segment": {
                    "text": "Referenced text portion"
                }
            }
        ]
    }
}

Basic setup and usage (Gemini >= 2.0):

def get_vertex_response(query: str):
    # Reuses the GIP client constructed in the Gemini < 2.0 example above
    response = client.chat.completions.create(
        model="gcp/gemini-2.0-flash-001",
        messages=[{"role": "user", "content": query}],
        extra_body={
            "model_extensions": {
                "provider": "gcp",
                "google_search": {}
            }
        }
    )

    content = response.choices[0].message.content
    grounding = (response.model_extra or {}).get("model_extensions", {}).get("grounding_metadata", {})

    return content, grounding

Example response structure (Gemini >= 2.0):

{
    "model_extensions": {
        "grounding_metadata": {
            "web_search_queries": [
                "who won world series 2020"
            ],
            "search_entry_point": {
                "rendered_content": "<style>\n.c...n",
                "sdk_blob": ""
            },
            "retrieval_queries": [],
            "grounding_chunks": [
                {
                    "web": {
                        "uri": "https://vertexaisearch.cloud.google.com/grounding-api-redirect/...",
                        "title": "latimes.com"
                    }
                },
                ...
            ],
            "grounding_supports": [
                {
                    "segment": {
                        "part_index": 0,
                        "start_index": 0,
                        "end_index": 88,
                        "text": "In 2020, the **Los Angeles Dodgers** won the World Series, defeating the Tampa Bay Rays."
                    },
                    "grounding_chunk_indices": [
                        0,
                        1,
                        2,
                        3,
                        4,
                        5
                    ],
                    "confidence_scores": [
                        0.95567983,
                        0.93321663,
                        0.9251957,
                        0.8089572,
                        0.95722926,
                        0.8422677
                    ]
                },
                ...
            ]
        }
    }
}
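
Each grounding_supports entry maps a span of the answer (segment.start_index / segment.end_index) to entries in grounding_chunks. The sketch below appends bracketed source markers after each supported span; it assumes the reported offsets are character offsets into the text (Vertex may report byte offsets for non-ASCII content), so treat it as illustrative:

def add_inline_citations(text: str, grounding: dict) -> str:
    # Insert markers from the last span to the first so earlier offsets stay valid
    supports = sorted(
        grounding.get("grounding_supports", []),
        key=lambda s: s["segment"]["end_index"],
        reverse=True,
    )
    for support in supports:
        end = support["segment"]["end_index"]
        # Turn 0-based chunk indices into 1-based markers like [1][2]
        markers = "".join(f"[{i + 1}]" for i in support["grounding_chunk_indices"])
        text = text[:end] + markers + text[end:]
    return text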

Key Features Comparison

Citation Return

To receive citations with a response, set return_citations: True in the model_extensions payload:

import asyncio
import json
import os
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageParam

async def request_pplx(client: AsyncOpenAI, query: str):
    model = "perplexity/sonar"
    messages: list[ChatCompletionMessageParam] = [{
        "role": "system",
        "content": """You are a helpful AI assistant.

Rules:
1. Provide only the final answer. Do not include any explanation of the steps below.
2. Do not show any intermediate step information.

Steps:
1. Decide if the answer should be a brief sentence or a list of suggestions.
2. If it is a list of suggestions, first write a brief and natural introduction based on the original query.
3. Then write the list of suggestions, separating each suggestion with two newlines."""
    },
    {
        "role": "user",
        "content": query
    }]

    model_extensions = {
        "return_citations": True,
    }

    response = await client.chat.completions.create(
        model=model,
        messages=messages,
        extra_body={"model_extensions": model_extensions}
    )

    if response.model_extra and "model_extensions" in response.model_extra:
        print(json.dumps(response.model_extra["model_extensions"]["citations"], ensure_ascii=False, indent=2))

if __name__ == "__main__":
    client = AsyncOpenAI(
        base_url=os.getenv("GIP_URL", ""),
        api_key=os.getenv("GIP_API_KEY", ""),
    )
    asyncio.run(request_pplx(client, "What are some good books to read?"))

Example citations output:

[
  "https://brooklinebooksmith.com/list/booksmith-top-100-2025",
  "https://brooklinebooksmith.com/collections/our-top-100-books",
  "https://www.penguinrandomhouse.com/the-read-down/the-best-books-of-2025/",
  "https://www.goodreads.com/list/show/264.Books_That_Everyone_Should_Read_At_Least_Once",
  "https://www.ereadersforum.com/threads/10-best-books-of-fall-2025-most-anticipated-new-releases-to-read.8377/"
]

Important Note: The bracketed citation markers in the model's response (e.g., [1], [2]) are 1-based references into the citations array, so [1] corresponds to the first URL (citations[0]).
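
A minimal sketch that rewrites those markers into Markdown links, assuming the 1-based convention above:

import re

def link_citations(text: str, citations: list[str]) -> str:
    """Replace [n] markers with Markdown links to the cited URLs."""
    def repl(match: re.Match) -> str:
        idx = int(match.group(1)) - 1  # [1] refers to citations[0]
        if 0 <= idx < len(citations):
            return f"[{match.group(1)}]({citations[idx]})"
        return match.group(0)  # leave out-of-range markers untouched
    return re.sub(r"\[(\d+)\]", repl, text)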

Search Parameters

You can enhance search accuracy using geographic and contextual parameters:

import asyncio
import json
import os
from typing import TypedDict
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageParam

class PlaceParams(TypedDict):
    store_name: str
    address: str
    country: str
    region: str
    city: str
    latitude: float
    longitude: float

async def request_pplx_with_search_params(
    client: AsyncOpenAI, 
    store_name: str, 
    address: str, 
    country: str, 
    region: str, 
    city: str, 
    latitude: float, 
    longitude: float
):
    model = "perplexity/sonar"
    messages: list[ChatCompletionMessageParam] = [{
        "role": "system",
        "content": """You are an AI Assistant that analyzes information about a requested place and provides a summary of its main features.

Rules:
1. Provide only the final answer. Do not include any explanation of the steps below.
2. Do not show any intermediate step information.
3. Always respond in **Korean** using polite form (존댓말: ~입니다, ~합니다).

Steps:
1. Search for reviews, visit records, and informational articles related to the <subject of analysis>.
2. Only use search results that focus exclusively on the target restaurant. Exclude sources that review multiple restaurants in one article or blog post.
3. Verify that all information extracted is specifically about the target restaurant mentioned in <subject of analysis>.
4. When writing summaries, ensure each sentence is within 60 characters and covers only one topic per sentence."""
    },
    {
        "role": "user",
        "content": f"<subject of analysis>\n매장명 : {store_name}\n주소: {address}\n</subject of analysis>"
    }]

    model_extensions = {
        "return_citations": True,
        "web_search_options": {
            "country": country,
            "region": region,
            "city": city,
            "latitude": latitude,
            "longitude": longitude,
            "search_context_size": "medium"
        }
    }

    response = await client.chat.completions.create(
        model=model,
        messages=messages,
        extra_body={"model_extensions": model_extensions}
    )
    
    if response.choices[0].message.content:
        print(response.choices[0].message.content)
    if response.model_extra and "model_extensions" in response.model_extra:
        citations = response.model_extra["model_extensions"]["citations"]
        print("\nCitations:")
        for i, citation in enumerate(citations, 1):
            print(f"[{i}] {citation}")

if __name__ == "__main__":
    client = AsyncOpenAI(
        base_url=os.getenv("GIP_URL", ""),
        api_key=os.getenv("GIP_API_KEY", ""),
    )
    
    kwargs: PlaceParams = {
        "store_name": "대찬낙지",
        "address": '"경기 성남시 분당구 이매동" 또는 "성남대로"',
        "country": "KR",
        "region": "Gyeonggi-do",
        "city": "Seongnam-si",
        "latitude": 37.3988,
        "longitude": 127.1297,
    }
    
    asyncio.run(request_pplx_with_search_params(client, **kwargs))

Supported Search Parameters:

  • country: Country code (e.g., "KR", "US")
  • region: Region or state
  • city: City name
  • latitude/longitude: Geographic coordinates
  • search_context_size: "low", "medium", or "high"

GCP Gemini Grounding Parameters

  • Configuration Options:
    • dynamic_threshold: Controls the relevance threshold a source must meet to be used for grounding (Gemini < 2.0 only)
  • Response Processing (the helper below extracts sources with confidence scores):
def process_vertex_grounding(response):
    grounding = (response.model_extra or {}).get("model_extensions", {}).get("grounding_metadata", {})
    sources = []

    # Enumerate all chunks so indices stay aligned with grounding_chunk_indices,
    # even when some chunks are skipped for lacking a "web" entry
    for chunk_idx, chunk in enumerate(grounding.get("grounding_chunks", [])):
        if "web" not in chunk:
            continue
        # Take the first confidence score that references this chunk
        confidence = next(
            (support["confidence_scores"][i]
             for support in grounding.get("grounding_supports", [])
             for i, idx in enumerate(support["grounding_chunk_indices"])
             if idx == chunk_idx),
            None
        )
        sources.append({
            "title": chunk["web"].get("title"),
            "url": chunk["web"].get("uri"),
            "confidence": confidence
        })
    return sources
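
For example, to keep only strongly supported sources, filter on the extracted confidence values (the 0.8 threshold here is arbitrary and chosen for illustration):

sources = process_vertex_grounding(response)
reliable = [s for s in sources if s["confidence"] is not None and s["confidence"] >= 0.8]
for source in reliable:
    print(f'{source["title"]}: {source["url"]} (confidence {source["confidence"]:.2f})')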

Use Case Recommendations

Choose Perplexity When:

  • Simple citation tracking is needed
  • Quick implementation is priority
  • Basic source linking is sufficient
  • URL-based citations are acceptable
  • Structured output formatting is required
  • Geographic search parameters are needed

Choose GCP Gemini When:

  • Detailed source verification is required
  • Confidence scoring is important
  • Text-to-source mapping is needed
  • Advanced grounding features are necessary

Best Practices

Perplexity Best Practices

  • Use AsyncOpenAI for better performance with async/await patterns
  • Set appropriate environment variables for GIP URL and API key
  • Combine return_citations: True with structured output for comprehensive responses
  • Use search parameters for location-specific queries
  • Never ask for URLs in prompts as they may be hallucinated

GCP Gemini Best Practices

  • Choose appropriate model versions (< 2.0 vs >= 2.0) based on your needs
  • Use confidence scores to filter reliable sources
  • Process grounding metadata to extract meaningful citations

Structured Output

You can use a JSON schema to specify the output format (Structured Output). Using Pydantic models makes the JSON schema easier to build.

import asyncio
import json
import os
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageParam
from pydantic import BaseModel, Field

class RecommendedBook(BaseModel):
    """A model representing a recommended book with its details."""
    book_title: str = Field(..., description="The title of the recommended book.")
    book_author: str = Field(..., description="The author of the recommended book.")
    book_description: str = Field(..., description="A brief description of the recommended book.")
    book_genre: str | None = Field(..., description="The genre of the recommended book. Set to None if the genre is unknown.")
    referenced_source_index: int = Field(..., description="The index of the source from which the book was referenced.")

class RecommendedBooks(BaseModel):
    """A model representing a list of recommended books."""
    recommended_books: list[RecommendedBook] = Field(..., description="A list of recommended books.")

async def request_pplx(client: AsyncOpenAI, query: str):
    model = "perplexity/sonar"
    messages: list[ChatCompletionMessageParam] = [{
        "role": "system",
        "content": """You are a helpful AI assistant.

Rules:
1. Provide only the final answer. Do not include any explanation of the steps below.
2. Do not show any intermediate step information.

Steps:
1. Decide if the answer should be a brief sentence or a list of suggestions.
2. If it is a list of suggestions, first write a brief and natural introduction based on the original query.
3. Then write the list of suggestions, separating each suggestion with two newlines."""
    },
    {
        "role": "user",
        "content": query
    }]

    model_extensions = {
        "return_citations": True,
    }

    response = await client.chat.completions.create(
        model=model,
        messages=messages,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "recommended_books",
                "strict": True,
                "schema": RecommendedBooks.model_json_schema()
            }
        },
        extra_body={"model_extensions": model_extensions}
    )
    
    if response.choices[0].message.content:
        print(json.dumps(json.loads(response.choices[0].message.content), ensure_ascii=False, indent=2))
    if response.model_extra and "model_extensions" in response.model_extra:
        print(json.dumps(response.model_extra["model_extensions"]["citations"], ensure_ascii=False, indent=2))

if __name__ == "__main__":
    client = AsyncOpenAI(
        base_url=os.getenv("GIP_URL", ""),
        api_key=os.getenv("GIP_API_KEY", ""),
    )
    asyncio.run(request_pplx(client, "What are some good books to read?"))

Note: Recursive JSON schemas are not supported. If a recursive schema is unavoidable, consider passing Perplexity's results to an OpenAI model for Structured Output, as sketched below.
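
A minimal sketch of that two-stage pattern, assuming both models are reachable through the same GIP endpoint; the model name openai/gpt-4o and the MyRecursiveModel Pydantic class are illustrative placeholders:

async def two_stage_structured(client: AsyncOpenAI, query: str) -> str:
    # Stage 1: get a cited free-text answer from Perplexity
    pplx_response = await client.chat.completions.create(
        model="perplexity/sonar",
        messages=[{"role": "user", "content": query}],
        extra_body={"model_extensions": {"return_citations": True}},
    )
    answer = pplx_response.choices[0].message.content or ""

    # Stage 2: restructure the answer with an OpenAI model, which supports
    # recursive schemas via Structured Output
    structured = await client.chat.completions.create(
        model="openai/gpt-4o",  # illustrative model name; substitute your deployment
        messages=[
            {"role": "system", "content": "Convert the user's text into the requested JSON structure."},
            {"role": "user", "content": answer},
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "recursive_output",
                "strict": True,
                "schema": MyRecursiveModel.model_json_schema(),  # hypothetical recursive model
            },
        },
    )
    return structured.choices[0].message.content or ""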
