From e66d576659198c833c75cf677a518311fac67d39 Mon Sep 17 00:00:00 2001 From: Karl Erickson <1775795+KarlErickson@users.noreply.github.com> Date: Wed, 14 Jan 2026 16:25:35 -0800 Subject: [PATCH 1/3] Java vector search sample --- .gitignore | 5 + ai/vector-search-java/README.md | 330 ++++++++++++++++++ ai/vector-search-java/pom.xml | 44 +++ .../azure/documentdb/samples/AppConfig.java | 48 +++ .../com/azure/documentdb/samples/DiskAnn.java | 217 ++++++++++++ .../com/azure/documentdb/samples/HNSW.java | 219 ++++++++++++ .../azure/documentdb/samples/HotelData.java | 31 ++ .../com/azure/documentdb/samples/IVF.java | 218 ++++++++++++ .../src/main/resources/application.properties | 14 + 9 files changed, 1126 insertions(+) create mode 100644 ai/vector-search-java/README.md create mode 100644 ai/vector-search-java/pom.xml create mode 100644 ai/vector-search-java/src/main/java/com/azure/documentdb/samples/AppConfig.java create mode 100644 ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java create mode 100644 ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java create mode 100644 ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HotelData.java create mode 100644 ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java create mode 100644 ai/vector-search-java/src/main/resources/application.properties diff --git a/.gitignore b/.gitignore index 65b1ad6..9f22943 100644 --- a/.gitignore +++ b/.gitignore @@ -485,3 +485,8 @@ dist/ *.user *.suo *.sln.docstates + +# Java +*.class +*.jar +target/ diff --git a/ai/vector-search-java/README.md b/ai/vector-search-java/README.md new file mode 100644 index 0000000..8628125 --- /dev/null +++ b/ai/vector-search-java/README.md @@ -0,0 +1,330 @@ +# Azure DocumentDB Vector Samples (Java) + +This project demonstrates vector search capabilities using Azure DocumentDB with Java. 
It includes implementations of three different vector index types: DiskANN, HNSW, and IVF, along with helper methods for embedding generation and data management. + +## Overview + +Vector search enables semantic similarity searching by converting text into high-dimensional vector representations (embeddings) and finding the most similar vectors in the database. This project shows how to: + +- Generate embeddings using Azure OpenAI +- Store vectors in Azure DocumentDB +- Create and use different types of vector indexes +- Perform similarity searches with various algorithms +- Handle authentication using Azure Active Directory (passwordless) or connection strings + +## Prerequisites + +Before running this project, you need: + +### Azure resources +1. **Azure subscription** with appropriate permissions +2. **Azure OpenAI resource** with embedding model deployment +3. **Azure DocumentDB resource** +4. **Azure CLI** installed and configured + +### Development environment + +- [Java 21 or higher](https://learn.microsoft.com/en-us/java/openjdk/download) +- [Maven 3.6 or higher](https://maven.apache.org/download.cgi) +- [Git](https://git-scm.com/downloads) (for cloning the repository) +- [Visual Studio Code](https://code.visualstudio.com/) (recommended) or another Java IDE + +## Setup instructions + +### Step 1: Clone and setup project + +```bash +# Clone this repository +git clone https://github.com/Azure-Samples/cosmos-db-vector-samples.git +cd cosmos-db-vector-samples/ai/vector-search-java + +# Compile the project +mvn clean compile +``` + +### Step 2: Create Azure resources + +#### Create Azure OpenAI resource + +```bash +# Login to Azure +az login + +# Create resource group (if needed) +az group create --name --location + +# Create Azure OpenAI resource +az cognitiveservices account create \ + --name \ + --resource-group \ + --location \ + --kind OpenAI \ + --sku S0 \ + --subscription +``` + +#### Deploy embedding model + +1. 
Go to Azure OpenAI Studio (https://oai.azure.com/) +2. Navigate to your OpenAI resource +3. Go to **Model deployments** and create a new deployment +4. Choose **text-embedding-ada-002** model +5. Note the deployment name for configuration + +#### Create Azure DocumentDB resource + +Create an Azure DocumentDB cluster by using the [Azure portal](https://learn.microsoft.com/azure/documentdb/quickstart-portal), [Bicep](https://learn.microsoft.com/azure/documentdb/quickstart-bicep), or [Terraform](https://learn.microsoft.com/azure/documentdb/quickstart-terraform). + +### Step 3: Get your resource information + +#### Azure OpenAI endpoint + +```bash +# Get OpenAI endpoint +az cognitiveservices account show \ + --name \ + --resource-group \ + --query "properties.endpoint" --output tsv +``` + +#### DocumentDB cluster name + +You'll need your DocumentDB cluster name (e.g., `my-cluster`), which you can find in the Azure portal or retrieve using: + +```bash +# List DocumentDB clusters in your resource group +az resource list \ + --resource-group "" \ + --resource-type "Microsoft.DocumentDB/mongoClusters" \ + --query "[].name" --output tsv +``` + +### Step 4: Configure application properties + +Edit the `src/main/resources/application.properties` file with your Azure resource information: + +```properties +# Azure OpenAI Embedding Settings +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-ada-002 +AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com/ +EMBEDDING_SIZE_BATCH=16 + +# MongoDB configuration +MONGO_CLUSTER_NAME= + +# Data file +DATA_FILE_WITH_VECTORS=../data/HotelsData_toCosmosDB_Vector.json +EMBEDDED_FIELD=text_embedding_ada_002 +EMBEDDING_DIMENSIONS=1536 +LOAD_SIZE_BATCH=100 +``` + +Alternatively, you can set these as environment variables which will take precedence over the properties file. 
+ +### Step 5: Configure passwordless authentication + +This sample uses passwordless authentication with Microsoft Entra ID for both Azure OpenAI and DocumentDB. Follow these steps to configure it: + +#### For Azure OpenAI + +Assign your Microsoft Entra ID user the following role on the Azure OpenAI resource: + - **Cognitive Services OpenAI User** (or **Cognitive Services OpenAI Contributor** for broader permissions) + +#### For Azure DocumentDB + +1. In your Azure DocumentDB resource, enable **Native DocumentDB and Microsoft Entra ID** authentication methods. +2. Assign your Microsoft Entra ID user the following roles on the DocumentDB resource: + - **Cosmos DB Account Reader Role** + - **DocumentDB Account Contributor** + +## Usage + +The project includes several Java classes that demonstrate different aspects of vector search: + +### 1. DiskANN vector search + +Run DiskANN (Disk-based Approximate Nearest Neighbor) search: + +```bash +mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.DiskAnn" +``` + +DiskANN is optimized for: +- Large datasets that don't fit in memory +- Efficient disk-based storage +- Good balance of speed and accuracy + +### 2. HNSW vector search + +Run HNSW (Hierarchical Navigable Small World) search: + +```bash +mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.HNSW" +``` + +HNSW provides: +- Excellent search performance +- High recall rates +- Hierarchical graph structure +- Good for real-time applications + +### 3. 
IVF vector search + +Run IVF (Inverted File) search: + +```bash +mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.IVF" +``` + +IVF features: +- Clusters vectors by similarity +- Fast search through cluster centroids +- Configurable accuracy vs speed trade-offs +- Efficient for large vector datasets + +## Project structure + +``` +ai/vector-search-java/ +├── pom.xml # Maven project configuration +├── src/ +│ └── main/ +│ ├── java/ +│ │ └── com/azure/documentdb/samples/ +│ │ ├── AppConfig.java # Configuration management +│ │ ├── DiskAnn.java # DiskANN vector search implementation +│ │ ├── HNSW.java # HNSW vector search implementation +│ │ ├── IVF.java # IVF vector search implementation +│ │ └── HotelData.java # Hotel data model +│ └── resources/ +│ └── application.properties # Configuration settings +└── data/ # Hotel data files with vectors +``` + +## Important notes + +### Vector index limitations + +**One Index Per Field**: Azure DocumentDB allows only one vector index per field. Each sample automatically handles this by: + +1. **Dropping existing collections**: Before creating a new vector index, each sample drops and recreates the collection +2. 
**Safe switching**: You can run different vector index samples in any order - each will create a fresh collection with the appropriate index + +```bash +# Example: Switch between different vector index types +mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.DiskAnn" # Creates DiskANN index +mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.HNSW" # Creates HNSW index +mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.IVF" # Creates IVF index +``` + +**What this means**: +- You cannot have both DiskANN and HNSW indexes simultaneously on the same field +- Each run creates a new collection with fresh data and the appropriate vector index +- No manual cleanup required + +### Cluster tier requirements + +Different vector index types require different cluster tiers: + +- **IVF**: Available on most tiers (including basic) +- **HNSW**: Requires standard tier or higher +- **DiskANN**: Requires premium/high-performance tier. Available on M30 and above + +If you encounter "not enabled for this cluster tier" errors: +1. Try a different index type (IVF is most widely supported) +2. Consider upgrading your cluster tier +3. Check the [Azure DocumentDB pricing page](https://azure.microsoft.com/pricing/details/documentdb/) for tier features + +## Key features + +### Vector index types + +- **DiskANN**: Optimized for large datasets with disk-based storage +- **HNSW**: High-performance hierarchical graph structure +- **IVF**: Clustering-based approach with configurable accuracy + +### Authentication + +- Passwordless authentication with Microsoft Entra ID using DefaultAzureCredential +- Azure AD authentication and RBAC for enhanced security +- Automatic token rotation and renewal + +### Sample data + +- Real hotel dataset with descriptions, locations, and amenities +- Pre-configured for embedding generation +- Includes various hotel types and price ranges + +## Troubleshooting + +### Common issues + +1. 
**Authentication Errors** + - Ensure Azure CLI is logged in: `az login` + - Verify you have proper RBAC permissions on Azure DocumentDB resource + - Check that Microsoft Entra ID is enabled for your DocumentDB resource + - Verify you have **Cosmos DB Account Reader Role** and **DocumentDB Account Contributor** roles assigned. Roles may take some time to propagate. + +2. **Embedding Generation Fails** + - Check Azure OpenAI model deployment name + - Verify API version compatibility + - Ensure API endpoint is accessible + +3. **Vector Search Returns No Results** + - Ensure data was inserted into collection successfully + - Verify vector indexes are built properly + - Check that embeddings match the expected dimensions + +4. **Compilation Issues** + - Verify Java 21 or higher is installed: `java -version` + - Verify Maven is installed: `mvn -version` + - Run `mvn clean install` to rebuild the project + +5. **Connection Issues** + - Ensure firewall rules allow your IP address + - Check that the cluster is running + - Verify `MONGO_CLUSTER_NAME` is set correctly + +## Performance considerations + +### Choosing vector index types + +- **Use DiskANN when**: Dataset is very large, memory is limited, vector count is up to 500,000+ +- **Use HNSW when**: Need fastest search, have sufficient memory, vector count is up to 50,000 +- **Use IVF when**: Want configurable accuracy/speed trade-offs, vector count is under 10,000 + +### Tuning parameters + +- **Batch sizes**: Adjust `LOAD_SIZE_BATCH` and `EMBEDDING_SIZE_BATCH` based on API rate limits and memory +- **Vector dimensions**: Must match your embedding model (1536 for text-embedding-ada-002) +- **Index parameters**: Tune for your specific accuracy/speed requirements + +### Cost optimization + +- Use appropriate Azure OpenAI pricing tier +- Monitor API usage and optimize batch processing + +## Further resources + +- [Azure DocumentDB Documentation](https://learn.microsoft.com/azure/documentdb/) +- [Azure OpenAI Service 
Documentation](https://learn.microsoft.com/azure/cognitive-services/openai/) +- [Vector Search in Azure DocumentDB](https://learn.microsoft.com/azure/cosmos-db/vector-database) +- [MongoDB Java Driver Documentation](https://mongodb.github.io/mongo-java-driver/) +- [Azure SDK for Java Documentation](https://learn.microsoft.com/java/api/overview/azure/) + +## Support + +If you encounter issues: +1. Check the troubleshooting section above +2. Review Azure resource configurations +3. Verify environment variable settings +4. Check Azure service status and quotas + +## License + +This project is licensed under the MIT License - see the [LICENSE](../LICENSE.md) file for details. + +## Contributing + +Contributions are welcome! Please see [CONTRIBUTING](../CONTRIBUTING.md) for details. diff --git a/ai/vector-search-java/pom.xml b/ai/vector-search-java/pom.xml new file mode 100644 index 0000000..30a04be --- /dev/null +++ b/ai/vector-search-java/pom.xml @@ -0,0 +1,44 @@ + + 4.0.0 + + com.azure.documentdb.samples + vector-search-quickstart + 1.0-SNAPSHOT + + + 21 + 21 + UTF-8 + + + + + org.mongodb + mongodb-driver-sync + 5.6.1 + + + com.azure + azure-identity + 1.11.4 + + + com.azure + azure-ai-openai + 1.0.0-beta.8 + + + com.fasterxml.jackson.core + jackson-databind + 2.15.3 + + + org.slf4j + slf4j-nop + 2.0.17 + runtime + + + diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/AppConfig.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/AppConfig.java new file mode 100644 index 0000000..bef1597 --- /dev/null +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/AppConfig.java @@ -0,0 +1,48 @@ +package com.azure.documentdb.samples; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; + +/** + * Application configuration loaded from environment variables and application.properties file. 
+ */ +public class AppConfig { + private final Map config = new HashMap<>(); + + public AppConfig() { + loadFromEnvironment(); + loadFromPropertiesFile(); + } + + private void loadFromEnvironment() { + System.getenv().forEach(config::put); + } + + private void loadFromPropertiesFile() { + try (InputStream input = getClass().getClassLoader().getResourceAsStream("application.properties")) { + if (input != null) { + Properties properties = new Properties(); + properties.load(input); + properties.forEach((key, value) -> config.putIfAbsent(key.toString(), value.toString())); + } + } catch (IOException e) { + System.err.println("Warning: Could not read application.properties file: " + e.getMessage()); + } + } + + public String get(String key) { + return config.get(key); + } + + public String getOrDefault(String key, String defaultValue) { + return config.getOrDefault(key, defaultValue); + } + + public int getIntOrDefault(String key, int defaultValue) { + var value = config.get(key); + return value != null ? 
Integer.parseInt(value) : defaultValue; + } +} diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java new file mode 100644 index 0000000..322ad7c --- /dev/null +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java @@ -0,0 +1,217 @@ +package com.azure.documentdb.samples; + +import com.azure.ai.openai.OpenAIClient; +import com.azure.ai.openai.OpenAIClientBuilder; +import com.azure.ai.openai.models.EmbeddingsOptions; +import com.azure.identity.DefaultAzureCredentialBuilder; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import com.mongodb.client.AggregateIterable; +import com.mongodb.client.model.Indexes; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; +import com.mongodb.ConnectionString; +import org.bson.Document; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +/** + * Vector search application using DiskANN index. 
+ */ +public class DiskAnn { + private static final String SAMPLE_QUERY = "What are some hotels with good accessibility?"; + private static final String DATABASE_NAME = "travel"; + private static final String COLLECTION_NAME = "hotels_diskann"; + private static final String VECTOR_INDEX_NAME = "vectorIndex_diskann"; + + private final AppConfig config = new AppConfig(); + private final ObjectMapper objectMapper = new ObjectMapper(); + + public static void main(String[] args) { + new DiskAnn().run(); + System.exit(0); + } + + public void run() { + try (var mongoClient = createMongoClient()) { + var openAIClient = createOpenAIClient(); + + var database = mongoClient.getDatabase(DATABASE_NAME); + var collection = database.getCollection(COLLECTION_NAME, Document.class); + + // Drop and recreate collection + collection.drop(); + database.createCollection(COLLECTION_NAME); + System.out.println("Created collection: " + COLLECTION_NAME); + + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); + + // Create standard indexes + collection.createIndex(Indexes.ascending("HotelName")); + collection.createIndex(Indexes.ascending("Category")); + + // Create vector index + createVectorIndex(database); + + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + + } catch (Exception e) { + System.err.println("Error: " + e.getMessage()); + e.printStackTrace(); + } + } + + private MongoClient createMongoClient() { + var clusterName = config.get("MONGO_CLUSTER_NAME"); + var azureCredential = new DefaultAzureCredentialBuilder().build(); + + MongoCredential.OidcCallback callback = (MongoCredential.OidcCallbackContext context) -> { + var token = azureCredential.getToken( + new com.azure.core.credential.TokenRequestContext() + .addScopes("https://ossrdbms-aad.database.windows.net/.default") + ).block(); + + if (token == null) { + throw new 
RuntimeException("Failed to obtain Azure AD token"); + } + + return new MongoCredential.OidcCallbackResult(token.getToken()); + }; + + var credential = MongoCredential.createOidcCredential(null) + .withMechanismProperty("OIDC_CALLBACK", callback); + + var connectionString = new ConnectionString( + String.format("mongodb+srv://%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", clusterName) + ); + + var settings = MongoClientSettings.builder() + .applyConnectionString(connectionString) + .credential(credential) + .build(); + + return MongoClients.create(settings); + } + + private OpenAIClient createOpenAIClient() { + var endpoint = config.get("AZURE_OPENAI_EMBEDDING_ENDPOINT"); + var credential = new DefaultAzureCredentialBuilder().build(); + + return new OpenAIClientBuilder() + .endpoint(endpoint) + .credential(credential) + .buildClient(); + } + + private List loadHotelData() throws IOException { + var dataFile = config.getOrDefault("DATA_FILE_WITH_VECTORS", "HotelsData_toCosmosDB_Vector.json"); + var filePath = Path.of(dataFile); + + System.out.println("Reading JSON file from " + filePath.toAbsolutePath()); + var jsonContent = Files.readString(filePath); + + return objectMapper.readValue(jsonContent, new TypeReference>() {}); + } + + private void insertDataInBatches(MongoCollection collection, List hotelData) { + var batchSize = config.getIntOrDefault("LOAD_SIZE_BATCH", 100); + var batches = partitionList(hotelData, batchSize); + + System.out.println("Processing in batches of " + batchSize + "..."); + + for (int i = 0; i < batches.size(); i++) { + var batch = batches.get(i); + var documents = batch.stream() + .map(this::convertToDocument) + .toList(); + + collection.insertMany(documents); + System.out.println("Batch " + (i + 1) + " complete: " + documents.size() + " inserted"); + } + } + + private Document convertToDocument(HotelData hotel) { + try { + var json = objectMapper.writeValueAsString(hotel); + 
return Document.parse(json); + } catch (Exception e) { + throw new RuntimeException("Failed to convert hotel to document", e); + } + } + + private void createVectorIndex(MongoDatabase database) { + var indexDefinition = new Document() + .append("createIndexes", COLLECTION_NAME) + .append("indexes", List.of( + new Document() + .append("name", VECTOR_INDEX_NAME) + .append("key", new Document("text_embedding_ada_002", "cosmosSearch")) + .append("cosmosSearchOptions", new Document() + .append("kind", "vector-diskann") + .append("dimensions", config.getIntOrDefault("EMBEDDING_DIMENSIONS", 1536)) + .append("similarity", "COS") + ) + )); + + database.runCommand(indexDefinition); + System.out.println("Created vector index: " + VECTOR_INDEX_NAME); + } + + private List createEmbedding(OpenAIClient openAIClient, String text) { + var model = config.getOrDefault("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"); + var options = new EmbeddingsOptions(List.of(text)); + + var response = openAIClient.getEmbeddings(model, options); + return response.getData().get(0).getEmbedding().stream() + .map(Float::doubleValue) + .toList(); + } + + private void performVectorSearch(MongoCollection collection, List queryEmbedding) { + var searchStage = new Document("$search", new Document() + .append("cosmosSearch", new Document() + .append("vector", queryEmbedding) + .append("path", "text_embedding_ada_002") + .append("k", 5) + ) + ); + + var projectStage = new Document("$project", new Document() + .append("HotelName", 1) + .append("score", new Document("$meta", "searchScore")) + ); + + var pipeline = List.of(searchStage, projectStage); + + System.out.println("\nVector search results for: \"" + SAMPLE_QUERY + "\""); + + AggregateIterable results = collection.aggregate(pipeline); + var rank = 1; + + for (var result : results) { + var hotelName = result.getString("HotelName"); + var score = result.getDouble("score"); + System.out.printf("%d. 
HotelName: %s, Score: %.4f%n", rank++, hotelName, score); + } + } + + private static List> partitionList(List list, int batchSize) { + var partitions = new ArrayList>(); + for (int i = 0; i < list.size(); i += batchSize) { + partitions.add(list.subList(i, Math.min(i + batchSize, list.size()))); + } + return partitions; + } +} diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java new file mode 100644 index 0000000..ca580f3 --- /dev/null +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java @@ -0,0 +1,219 @@ +package com.azure.documentdb.samples; + +import com.azure.ai.openai.OpenAIClient; +import com.azure.ai.openai.OpenAIClientBuilder; +import com.azure.ai.openai.models.EmbeddingsOptions; +import com.azure.identity.DefaultAzureCredentialBuilder; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import com.mongodb.client.AggregateIterable; +import com.mongodb.client.model.Indexes; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; +import com.mongodb.ConnectionString; +import org.bson.Document; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +/** + * Vector search application using HNSW index. 
+ */ +public class HNSW { + private static final String SAMPLE_QUERY = "What are some hotels with good accessibility?"; + private static final String DATABASE_NAME = "travel"; + private static final String COLLECTION_NAME = "hotels_hnsw"; + private static final String VECTOR_INDEX_NAME = "vectorIndex_hnsw"; + + private final AppConfig config = new AppConfig(); + private final ObjectMapper objectMapper = new ObjectMapper(); + + public static void main(String[] args) { + new HNSW().run(); + System.exit(0); + } + + public void run() { + try (var mongoClient = createMongoClient()) { + var openAIClient = createOpenAIClient(); + + var database = mongoClient.getDatabase(DATABASE_NAME); + var collection = database.getCollection(COLLECTION_NAME, Document.class); + + // Drop and recreate collection + collection.drop(); + database.createCollection(COLLECTION_NAME); + System.out.println("Created collection: " + COLLECTION_NAME); + + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); + + // Create standard indexes + collection.createIndex(Indexes.ascending("HotelName")); + collection.createIndex(Indexes.ascending("Category")); + + // Create vector index + createVectorIndex(database); + + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + + } catch (Exception e) { + System.err.println("Error: " + e.getMessage()); + e.printStackTrace(); + } + } + + private MongoClient createMongoClient() { + var clusterName = config.get("MONGO_CLUSTER_NAME"); + var azureCredential = new DefaultAzureCredentialBuilder().build(); + + MongoCredential.OidcCallback callback = (MongoCredential.OidcCallbackContext context) -> { + var token = azureCredential.getToken( + new com.azure.core.credential.TokenRequestContext() + .addScopes("https://ossrdbms-aad.database.windows.net/.default") + ).block(); + + if (token == null) { + throw new 
RuntimeException("Failed to obtain Azure AD token"); + } + + return new MongoCredential.OidcCallbackResult(token.getToken()); + }; + + var credential = MongoCredential.createOidcCredential(null) + .withMechanismProperty("OIDC_CALLBACK", callback); + + var connectionString = new ConnectionString( + String.format("mongodb+srv://%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", clusterName) + ); + + var settings = MongoClientSettings.builder() + .applyConnectionString(connectionString) + .credential(credential) + .build(); + + return MongoClients.create(settings); + } + + private OpenAIClient createOpenAIClient() { + var endpoint = config.get("AZURE_OPENAI_EMBEDDING_ENDPOINT"); + var credential = new DefaultAzureCredentialBuilder().build(); + + return new OpenAIClientBuilder() + .endpoint(endpoint) + .credential(credential) + .buildClient(); + } + + private List loadHotelData() throws IOException { + var dataFile = config.getOrDefault("DATA_FILE_WITH_VECTORS", "HotelsData_toCosmosDB_Vector.json"); + var filePath = Path.of(dataFile); + + System.out.println("Reading JSON file from " + filePath.toAbsolutePath()); + var jsonContent = Files.readString(filePath); + + return objectMapper.readValue(jsonContent, new TypeReference>() {}); + } + + private void insertDataInBatches(MongoCollection collection, List hotelData) { + var batchSize = config.getIntOrDefault("LOAD_SIZE_BATCH", 100); + var batches = partitionList(hotelData, batchSize); + + System.out.println("Processing in batches of " + batchSize + "..."); + + for (int i = 0; i < batches.size(); i++) { + var batch = batches.get(i); + var documents = batch.stream() + .map(this::convertToDocument) + .toList(); + + collection.insertMany(documents); + System.out.println("Batch " + (i + 1) + " complete: " + documents.size() + " inserted"); + } + } + + private Document convertToDocument(HotelData hotel) { + try { + var json = objectMapper.writeValueAsString(hotel); + 
return Document.parse(json); + } catch (Exception e) { + throw new RuntimeException("Failed to convert hotel to document", e); + } + } + + private void createVectorIndex(MongoDatabase database) { + var indexDefinition = new Document() + .append("createIndexes", COLLECTION_NAME) + .append("indexes", List.of( + new Document() + .append("name", VECTOR_INDEX_NAME) + .append("key", new Document("text_embedding_ada_002", "cosmosSearch")) + .append("cosmosSearchOptions", new Document() + .append("kind", "vector-hnsw") + .append("dimensions", config.getIntOrDefault("EMBEDDING_DIMENSIONS", 1536)) + .append("similarity", "COS") + .append("m", 16) + .append("efConstruction", 64) + ) + )); + + database.runCommand(indexDefinition); + System.out.println("Created vector index: " + VECTOR_INDEX_NAME); + } + + private List createEmbedding(OpenAIClient openAIClient, String text) { + var model = config.getOrDefault("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"); + var options = new EmbeddingsOptions(List.of(text)); + + var response = openAIClient.getEmbeddings(model, options); + return response.getData().get(0).getEmbedding().stream() + .map(Float::doubleValue) + .toList(); + } + + private void performVectorSearch(MongoCollection collection, List queryEmbedding) { + var searchStage = new Document("$search", new Document() + .append("cosmosSearch", new Document() + .append("vector", queryEmbedding) + .append("path", "text_embedding_ada_002") + .append("k", 5) + ) + ); + + var projectStage = new Document("$project", new Document() + .append("HotelName", 1) + .append("score", new Document("$meta", "searchScore")) + ); + + var pipeline = List.of(searchStage, projectStage); + + System.out.println("\nVector search results for: \"" + SAMPLE_QUERY + "\""); + + AggregateIterable results = collection.aggregate(pipeline); + var rank = 1; + + for (var result : results) { + var hotelName = result.getString("HotelName"); + var score = result.getDouble("score"); + System.out.printf("%d. 
HotelName: %s, Score: %.4f%n", rank++, hotelName, score); + } + } + + private static List> partitionList(List list, int batchSize) { + var partitions = new ArrayList>(); + for (int i = 0; i < list.size(); i += batchSize) { + partitions.add(list.subList(i, Math.min(i + batchSize, list.size()))); + } + return partitions; + } +} diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HotelData.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HotelData.java new file mode 100644 index 0000000..51024bb --- /dev/null +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HotelData.java @@ -0,0 +1,31 @@ +package com.azure.documentdb.samples; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonProperty; +import java.util.List; + +/** + * Represents hotel data with vector embeddings. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public record HotelData( + @JsonProperty("HotelId") String hotelId, + @JsonProperty("HotelName") String hotelName, + @JsonProperty("Description") String description, + @JsonProperty("Category") String category, + @JsonProperty("Tags") List tags, + @JsonProperty("ParkingIncluded") boolean parkingIncluded, + @JsonProperty("SmokingAllowed") boolean smokingAllowed, + @JsonProperty("LastRenovationDate") String lastRenovationDate, + @JsonProperty("Rating") double rating, + @JsonProperty("Address") Address address, + @JsonProperty("text_embedding_ada_002") List textEmbeddingAda002 +) { + public record Address( + @JsonProperty("StreetAddress") String streetAddress, + @JsonProperty("City") String city, + @JsonProperty("StateProvince") String stateProvince, + @JsonProperty("PostalCode") String postalCode, + @JsonProperty("Country") String country + ) {} +} diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java new file mode 100644 
index 0000000..7126df1 --- /dev/null +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java @@ -0,0 +1,218 @@ +package com.azure.documentdb.samples; + +import com.azure.ai.openai.OpenAIClient; +import com.azure.ai.openai.OpenAIClientBuilder; +import com.azure.ai.openai.models.EmbeddingsOptions; +import com.azure.identity.DefaultAzureCredentialBuilder; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import com.mongodb.client.AggregateIterable; +import com.mongodb.client.model.Indexes; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; +import com.mongodb.ConnectionString; +import org.bson.Document; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +/** + * Vector search application using IVF index. 
+ */ +public class IVF { + private static final String SAMPLE_QUERY = "What are some hotels with good accessibility?"; + private static final String DATABASE_NAME = "travel"; + private static final String COLLECTION_NAME = "hotels_ivf"; + private static final String VECTOR_INDEX_NAME = "vectorIndex_ivf"; + + private final AppConfig config = new AppConfig(); + private final ObjectMapper objectMapper = new ObjectMapper(); + + public static void main(String[] args) { + new IVF().run(); + System.exit(0); + } + + public void run() { + try (var mongoClient = createMongoClient()) { + var openAIClient = createOpenAIClient(); + + var database = mongoClient.getDatabase(DATABASE_NAME); + var collection = database.getCollection(COLLECTION_NAME, Document.class); + + // Drop and recreate collection + collection.drop(); + database.createCollection(COLLECTION_NAME); + System.out.println("Created collection: " + COLLECTION_NAME); + + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); + + // Create standard indexes + collection.createIndex(Indexes.ascending("HotelName")); + collection.createIndex(Indexes.ascending("Category")); + + // Create vector index + createVectorIndex(database); + + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + + } catch (Exception e) { + System.err.println("Error: " + e.getMessage()); + e.printStackTrace(); + } + } + + private MongoClient createMongoClient() { + var clusterName = config.get("MONGO_CLUSTER_NAME"); + var azureCredential = new DefaultAzureCredentialBuilder().build(); + + MongoCredential.OidcCallback callback = (MongoCredential.OidcCallbackContext context) -> { + var token = azureCredential.getToken( + new com.azure.core.credential.TokenRequestContext() + .addScopes("https://ossrdbms-aad.database.windows.net/.default") + ).block(); + + if (token == null) { + throw new RuntimeException("Failed to 
obtain Azure AD token"); + } + + return new MongoCredential.OidcCallbackResult(token.getToken()); + }; + + var credential = MongoCredential.createOidcCredential(null) + .withMechanismProperty("OIDC_CALLBACK", callback); + + var connectionString = new ConnectionString( + String.format("mongodb+srv://%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", clusterName) + ); + + var settings = MongoClientSettings.builder() + .applyConnectionString(connectionString) + .credential(credential) + .build(); + + return MongoClients.create(settings); + } + + private OpenAIClient createOpenAIClient() { + var endpoint = config.get("AZURE_OPENAI_EMBEDDING_ENDPOINT"); + var credential = new DefaultAzureCredentialBuilder().build(); + + return new OpenAIClientBuilder() + .endpoint(endpoint) + .credential(credential) + .buildClient(); + } + + private List loadHotelData() throws IOException { + var dataFile = config.getOrDefault("DATA_FILE_WITH_VECTORS", "HotelsData_toCosmosDB_Vector.json"); + var filePath = Path.of(dataFile); + + System.out.println("Reading JSON file from " + filePath.toAbsolutePath()); + var jsonContent = Files.readString(filePath); + + return objectMapper.readValue(jsonContent, new TypeReference>() {}); + } + + private void insertDataInBatches(MongoCollection collection, List hotelData) { + var batchSize = config.getIntOrDefault("LOAD_SIZE_BATCH", 100); + var batches = partitionList(hotelData, batchSize); + + System.out.println("Processing in batches of " + batchSize + "..."); + + for (int i = 0; i < batches.size(); i++) { + var batch = batches.get(i); + var documents = batch.stream() + .map(this::convertToDocument) + .toList(); + + collection.insertMany(documents); + System.out.println("Batch " + (i + 1) + " complete: " + documents.size() + " inserted"); + } + } + + private Document convertToDocument(HotelData hotel) { + try { + var json = objectMapper.writeValueAsString(hotel); + return Document.parse(json); 
+ } catch (Exception e) { + throw new RuntimeException("Failed to convert hotel to document", e); + } + } + + private void createVectorIndex(MongoDatabase database) { + var indexDefinition = new Document() + .append("createIndexes", COLLECTION_NAME) + .append("indexes", List.of( + new Document() + .append("name", VECTOR_INDEX_NAME) + .append("key", new Document("text_embedding_ada_002", "cosmosSearch")) + .append("cosmosSearchOptions", new Document() + .append("kind", "vector-ivf") + .append("dimensions", config.getIntOrDefault("EMBEDDING_DIMENSIONS", 1536)) + .append("similarity", "COS") + .append("numLists", 1) + ) + )); + + database.runCommand(indexDefinition); + System.out.println("Created vector index: " + VECTOR_INDEX_NAME); + } + + private List createEmbedding(OpenAIClient openAIClient, String text) { + var model = config.getOrDefault("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"); + var options = new EmbeddingsOptions(List.of(text)); + + var response = openAIClient.getEmbeddings(model, options); + return response.getData().get(0).getEmbedding().stream() + .map(Float::doubleValue) + .toList(); + } + + private void performVectorSearch(MongoCollection collection, List queryEmbedding) { + var searchStage = new Document("$search", new Document() + .append("cosmosSearch", new Document() + .append("vector", queryEmbedding) + .append("path", "text_embedding_ada_002") + .append("k", 5) + ) + ); + + var projectStage = new Document("$project", new Document() + .append("HotelName", 1) + .append("score", new Document("$meta", "searchScore")) + ); + + var pipeline = List.of(searchStage, projectStage); + + System.out.println("\nVector search results for: \"" + SAMPLE_QUERY + "\""); + + AggregateIterable results = collection.aggregate(pipeline); + var rank = 1; + + for (var result : results) { + var hotelName = result.getString("HotelName"); + var score = result.getDouble("score"); + System.out.printf("%d. 
HotelName: %s, Score: %.4f%n", rank++, hotelName, score); + } + } + + private static List> partitionList(List list, int batchSize) { + var partitions = new ArrayList>(); + for (int i = 0; i < list.size(); i += batchSize) { + partitions.add(list.subList(i, Math.min(i + batchSize, list.size()))); + } + return partitions; + } +} diff --git a/ai/vector-search-java/src/main/resources/application.properties b/ai/vector-search-java/src/main/resources/application.properties new file mode 100644 index 0000000..189a070 --- /dev/null +++ b/ai/vector-search-java/src/main/resources/application.properties @@ -0,0 +1,14 @@ + # Azure OpenAI Embedding Settings + AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-ada-002 + AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 + AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com/ + EMBEDDING_SIZE_BATCH=16 + + # MongoDB configuration + MONGO_CLUSTER_NAME= + + # Data file + DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + EMBEDDED_FIELD=text_embedding_ada_002 + EMBEDDING_DIMENSIONS=1536 + LOAD_SIZE_BATCH=100 \ No newline at end of file From 7e5f1befa30f65172e78b67fd8afe7cc5ce341e9 Mon Sep 17 00:00:00 2001 From: Karl Erickson <1775795+KarlErickson@users.noreply.github.com> Date: Fri, 16 Jan 2026 14:53:49 -0800 Subject: [PATCH 2/3] removed application.properties, updated dependency versions in pom.xml --- ai/vector-search-java/pom.xml | 13 ++++++------- .../src/main/resources/application.properties | 14 -------------- 2 files changed, 6 insertions(+), 21 deletions(-) delete mode 100644 ai/vector-search-java/src/main/resources/application.properties diff --git a/ai/vector-search-java/pom.xml b/ai/vector-search-java/pom.xml index 30a04be..99de228 100644 --- a/ai/vector-search-java/pom.xml +++ b/ai/vector-search-java/pom.xml @@ -8,8 +8,7 @@ 1.0-SNAPSHOT - 21 - 21 + 25 UTF-8 @@ -17,22 +16,22 @@ org.mongodb mongodb-driver-sync - 5.6.1 + 5.6.2 com.azure azure-identity - 1.11.4 + 1.18.1 com.azure azure-ai-openai - 1.0.0-beta.8 + 1.0.0-beta.16 - 
com.fasterxml.jackson.core + tools.jackson.core jackson-databind - 2.15.3 + 3.0.3 org.slf4j diff --git a/ai/vector-search-java/src/main/resources/application.properties b/ai/vector-search-java/src/main/resources/application.properties deleted file mode 100644 index 189a070..0000000 --- a/ai/vector-search-java/src/main/resources/application.properties +++ /dev/null @@ -1,14 +0,0 @@ - # Azure OpenAI Embedding Settings - AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-ada-002 - AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 - AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com/ - EMBEDDING_SIZE_BATCH=16 - - # MongoDB configuration - MONGO_CLUSTER_NAME= - - # Data file - DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json - EMBEDDED_FIELD=text_embedding_ada_002 - EMBEDDING_DIMENSIONS=1536 - LOAD_SIZE_BATCH=100 \ No newline at end of file From 031647687f37b3dc0085ed3d07f4aaf80b8c8a01 Mon Sep 17 00:00:00 2001 From: Karl Erickson <1775795+KarlErickson@users.noreply.github.com> Date: Fri, 16 Jan 2026 16:17:46 -0800 Subject: [PATCH 3/3] updated Java sample to align with TypeScript --- .devcontainer/java/devcontainer.json | 18 +- ai/vector-search-java/README.md | 311 +++++------------- ai/vector-search-java/pom.xml | 2 +- .../azure/documentdb/samples/AppConfig.java | 48 --- .../com/azure/documentdb/samples/DiskAnn.java | 157 ++++----- .../com/azure/documentdb/samples/HNSW.java | 155 ++++----- .../azure/documentdb/samples/HotelData.java | 31 -- .../com/azure/documentdb/samples/IVF.java | 157 ++++----- 8 files changed, 333 insertions(+), 546 deletions(-) delete mode 100644 ai/vector-search-java/src/main/java/com/azure/documentdb/samples/AppConfig.java delete mode 100644 ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HotelData.java diff --git a/.devcontainer/java/devcontainer.json b/.devcontainer/java/devcontainer.json index fcd9c3a..6b631e0 100644 --- a/.devcontainer/java/devcontainer.json +++ b/.devcontainer/java/devcontainer.json @@ -1,24 +1,20 @@ // For 
format details, see https://aka.ms/devcontainer.json. For config options, see the -// README at: https://github.com/devcontainers/templates/tree/main/src/java +// README at: https://github.com/devcontainers/templates/tree/main/src/universal { "name": "Default Java", // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile "image": "mcr.microsoft.com/devcontainers/java:latest", "features": { - "ghcr.io/devcontainers/features/azure-cli:1": {}, - "ghcr.io/devcontainers/features/docker-in-docker:2": {}, - "ghcr.io/azure/azure-dev/azd:0": {} + "ghcr.io/devcontainers/features/java:1": { + "version": "none", + "installMaven": "true" + }, + "ghcr.io/devcontainers/features/azure-cli:1": {} }, "customizations": { "vscode": { "extensions": [ - "ms-azuretools.vscode-cosmosdb", - "buildwithlayer.mongodb-integration-expert-qS6DB", - "mongodb.mongodb-vscode", - "ms-azuretools.vscode-documentdb", - "redhat.java", - "vscjava.vscode-maven", - "vscjava.vscode-gradle" + "ms-azuretools.vscode-documentdb" ] } } diff --git a/ai/vector-search-java/README.md b/ai/vector-search-java/README.md index 8628125..929c2df 100644 --- a/ai/vector-search-java/README.md +++ b/ai/vector-search-java/README.md @@ -1,153 +1,128 @@ -# Azure DocumentDB Vector Samples (Java) +# DocumentDB Vector Samples (Java) -This project demonstrates vector search capabilities using Azure DocumentDB with Java. It includes implementations of three different vector index types: DiskANN, HNSW, and IVF, along with helper methods for embedding generation and data management. +This project demonstrates vector search capabilities using Azure DocumentDB with Java. It includes implementations of three different vector index types: DiskANN, HNSW, and IVF. ## Overview Vector search enables semantic similarity searching by converting text into high-dimensional vector representations (embeddings) and finding the most similar vectors in the database. 
This project shows how to: - Generate embeddings using Azure OpenAI -- Store vectors in Azure DocumentDB +- Store vectors in DocumentDB - Create and use different types of vector indexes - Perform similarity searches with various algorithms -- Handle authentication using Azure Active Directory (passwordless) or connection strings ## Prerequisites Before running this project, you need: -### Azure resources +### Azure Resources 1. **Azure subscription** with appropriate permissions -2. **Azure OpenAI resource** with embedding model deployment -3. **Azure DocumentDB resource** -4. **Azure CLI** installed and configured +2. **[Azure Developer CLI (azd)](https://learn.microsoft.com/azure/developer/azure-developer-cli/)** installed -### Development environment - -- [Java 21 or higher](https://learn.microsoft.com/en-us/java/openjdk/download) +### Development Environment +- [Java 21 or higher](https://learn.microsoft.com/java/openjdk/download) - [Maven 3.6 or higher](https://maven.apache.org/download.cgi) - [Git](https://git-scm.com/downloads) (for cloning the repository) - [Visual Studio Code](https://code.visualstudio.com/) (recommended) or another Java IDE -## Setup instructions +## Setup Instructions -### Step 1: Clone and setup project +### Clone and Setup Project ```bash # Clone this repository -git clone https://github.com/Azure-Samples/cosmos-db-vector-samples.git -cd cosmos-db-vector-samples/mongo-vcore-vector-search-java - -# Compile the project -mvn clean compile -``` - -### Step 2: Create Azure resources - -#### Create Azure OpenAI resource - -```bash -# Login to Azure -az login - -# Create resource group (if needed) -az group create --name --location - -# Create Azure OpenAI resource -az cognitiveservices account create \ - --name \ - --resource-group \ - --location \ - --kind OpenAI \ - --sku S0 \ - --subscription +git clone https://github.com/Azure-Samples/documentdb-samples ``` -#### Deploy embedding model +### Deploy Azure Resources -1. 
Go to Azure OpenAI Studio (https://oai.azure.com/) -2. Navigate to your OpenAI resource -3. Go to **Model deployments** and create a new deployment -4. Choose **text-embedding-ada-002** model -5. Note the deployment name for configuration +This project uses Azure Developer CLI (azd) to deploy all required Azure resources from the existing infrastructure-as-code files. -#### Create Azure DocumentDB resource +#### Install Azure Developer CLI -Create a Azure DocumentDB cluster by using the [Azure portal](https://learn.microsoft.com/azure/documentdb/quickstart-portal), [Bicep](https://learn.microsoft.com/azure/documentdb/quickstart-bicep), or [Terraform](https://learn.microsoft.com/azure/documentdb/quickstart-terraform). +If you haven't already, install the Azure Developer CLI: -### Step 3: Get your resource information +**Windows:** +```powershell +winget install microsoft.azd +``` -#### Azure OpenAI endpoint +**macOS:** +```bash +brew tap azure/azd && brew install azd +``` +**Linux:** ```bash -# Get OpenAI endpoint -az cognitiveservices account show \ - --name \ - --resource-group \ - --query "properties.endpoint" --output tsv +curl -fsSL https://aka.ms/install-azd.sh | bash ``` -#### DocumentDB cluster name +#### Deploy Resources -You'll need your DocumentDB cluster name (e.g., `my-cluster`), which you can find in the Azure portal or retrieve using: +Navigate to the root of the repository and run: ```bash -# List DocumentDB clusters in your resource group -az resource list \ - --resource-group "" \ - --resource-type "Microsoft.DocumentDB/mongoClusters" \ - --query "[].name" --output tsv +# Login to Azure +azd auth login + +# Provision Azure resources +azd up ``` -### Step 4: Configure application properties +During provisioning, you'll be prompted for: +- **Environment name**: A unique name for your deployment (e.g., "my-vector-search") +- **Azure subscription**: Select your Azure subscription +- **Location**: Choose from `eastus2` or `swedencentral` (required for 
OpenAI models) -Edit the `src/main/resources/application.properties` file with your Azure resource information: +The `azd up` command will: +- Create a resource group +- Deploy Azure OpenAI with text-embedding-3-small model +- Deploy Azure DocumentDB (MongoDB vCore) cluster +- Create a managed identity for secure access +- Configure all necessary permissions and networking +- Generate a `.env` file with all connection information at the repository root -```properties -# Azure OpenAI Embedding Settings -AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-ada-002 -AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 -AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com/ -EMBEDDING_SIZE_BATCH=16 +### Compile the Project -# MongoDB configuration -MONGO_CLUSTER_NAME= +```bash +# Move to Java vector search project +cd ai/vector-search-java -# Data file -DATA_FILE_WITH_VECTORS=../data/HotelsData_toCosmosDB_Vector.json -EMBEDDED_FIELD=text_embedding_ada_002 -EMBEDDING_DIMENSIONS=1536 -LOAD_SIZE_BATCH=100 +# Compile the project +mvn clean compile ``` -Alternatively, you can set these as environment variables which will take precedence over the properties file. +### Load Environment Variables -### Step 5: Configure passwordless authentication +After deployment completes, load the environment variables from the generated `.env` file. The `set -a` command ensures variables are exported to child processes (like the Maven JVM): -This sample uses passwordless authentication with Microsoft Entra ID for both Azure OpenAI and DocumentDB. 
Follow these steps to configure it: +```bash +# From the ai/vector-search-java directory +set -a && source ../../.env && set +a +``` -#### For Azure OpenAI +You can verify the environment variables are set: -Assign your Microsoft Entra ID user the following role on the Azure OpenAI resource: - - **Cognitive Services OpenAI User** (or **Cognitive Services OpenAI Contributor** for broader permissions) +```bash +echo $MONGO_CLUSTER_NAME +``` -#### For Azure DocumentDB +## Usage -1. In your Azure DocumentDB resource, enable **Native DocumentDB and Microsoft Entra ID** authentication methods. -2. Assign your Microsoft Entra ID user the following roles on the DocumentDB resource: - - **Cosmos DB Account Reader Role** - - **DocumentDB Account Contributor** +The project includes several Java classes that demonstrate different aspects of vector search. -## Usage +### Sign in to Azure for passwordless connection -The project includes several Java classes that demonstrate different aspects of vector search: +```bash +az login +``` -### 1. DiskANN vector search +### DiskANN Vector Search Run DiskANN (Disk-based Approximate Nearest Neighbor) search: ```bash -mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.DiskAnn" +mvn exec:java -Dexec.mainClass="com.azure.documentdb.samples.DiskAnn" ``` DiskANN is optimized for: @@ -155,12 +130,12 @@ DiskANN is optimized for: - Efficient disk-based storage - Good balance of speed and accuracy -### 2. HNSW vector search +### HNSW Vector Search Run HNSW (Hierarchical Navigable Small World) search: ```bash -mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.HNSW" +mvn exec:java -Dexec.mainClass="com.azure.documentdb.samples.HNSW" ``` HNSW provides: @@ -169,12 +144,12 @@ HNSW provides: - Hierarchical graph structure - Good for real-time applications -### 3. 
IVF vector search +### IVF Vector Search Run IVF (Inverted File) search: ```bash -mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.IVF" +mvn exec:java -Dexec.mainClass="com.azure.documentdb.samples.IVF" ``` IVF features: @@ -183,148 +158,20 @@ IVF features: - Configurable accuracy vs speed trade-offs - Efficient for large vector datasets -## Project structure - -``` -mongo-vcore-vector-search-java/ -├── pom.xml # Maven project configuration -├── src/ -│ └── main/ -│ ├── java/ -│ │ └── com/azure/documentdb/samples/ -│ │ ├── AppConfig.java # Configuration management -│ │ ├── DiskAnn.java # DiskANN vector search implementation -│ │ ├── HNSW.java # HNSW vector search implementation -│ │ ├── IVF.java # IVF vector search implementation -│ │ └── HotelData.java # Hotel data model -│ └── resources/ -│ └── application.properties # Configuration settings -└── data/ # Hotel data files with vectors -``` - -## Important notes - -### Vector index limitations - -**One Index Per Field**: Azure DocumentDB allows only one vector index per field. Each sample automatically handles this by: - -1. **Dropping existing collections**: Before creating a new vector index, each sample drops and recreates the collection -2. 
**Safe switching**: You can run different vector index samples in any order - each will create a fresh collection with the appropriate index - -```bash -# Example: Switch between different vector index types -mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.DiskAnn" # Creates DiskANN index -mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.HNSW" # Creates HNSW index -mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.samples.IVF" # Creates IVF index -``` - -**What this means**: -- You cannot have both DiskANN and HNSW indexes simultaneously on the same field -- Each run creates a new collection with fresh data and the appropriate vector index -- No manual cleanup required - -### Cluster tier requirements - -Different vector index types require different cluster tiers: - -- **IVF**: Available on most tiers (including basic) -- **HNSW**: Requires standard tier or higher -- **DiskANN**: Requires premium/high-performance tier. Available on M30 and above - -If you encounter "not enabled for this cluster tier" errors: -1. Try a different index type (IVF is most widely supported) -2. Consider upgrading your cluster tier -3. Check the [Azure DocumentDB pricing page](https://azure.microsoft.com/pricing/details/documentdb/) for tier features - -## Key features - -### Vector index types - -- **DiskANN**: Optimized for large datasets with disk-based storage -- **HNSW**: High-performance hierarchical graph structure -- **IVF**: Clustering-based approach with configurable accuracy - -### Authentication - -- Passwordless authentication with Microsoft Entra ID using DefaultAzureCredential -- Azure AD authentication and RBAC for enhanced security -- Automatic token rotation and renewal - -### Sample data - -- Real hotel dataset with descriptions, locations, and amenities -- Pre-configured for embedding generation -- Includes various hotel types and price ranges - -## Troubleshooting - -### Common issues - -1. 
**Authentication Errors** - - Ensure Azure CLI is logged in: `az login` - - Verify you have proper RBAC permissions on Azure DocumentDB resource - - Check that Microsoft Entra ID is enabled for your DocumentDB resource - - Verify you have **Cosmos DB Account Reader Role** and **DocumentDB Account Contributor** roles assigned. Roles may take some time to propagate. - -2. **Embedding Generation Fails** - - Check Azure OpenAI model deployment name - - Verify API version compatibility - - Ensure API endpoint is accessible - -3. **Vector Search Returns No Results** - - Ensure data was inserted into collection successfully - - Verify vector indexes are built properly - - Check that embeddings match the expected dimensions - -4. **Compilation Issues** - - Verify Java 21 or higher is installed: `java -version` - - Verify Maven is installed: `mvn -version` - - Run `mvn clean install` to rebuild the project - -5. **Connection Issues** - - Ensure firewall rules allow your IP address - - Check that the cluster is running - - Verify `MONGO_CLUSTER_NAME` is set correctly - -## Performance considerations - -### Choosing vector index types - -- **Use DiskANN when**: Dataset is very large, memory is limited, vector count is up to 500,000+ -- **Use HNSW when**: Need fastest search, have sufficient memory, vector count is up to 50,000 -- **Use IVF when**: Want configurable accuracy/speed trade-offs, vector count is under 10,000 - -### Tuning parameters - -- **Batch sizes**: Adjust `LOAD_SIZE_BATCH` and `EMBEDDING_SIZE_BATCH` based on API rate limits and memory -- **Vector dimensions**: Must match your embedding model (1536 for text-embedding-ada-002) -- **Index parameters**: Tune for your specific accuracy/speed requirements - -### Cost optimization - -- Use appropriate Azure OpenAI pricing tier -- Monitor API usage and optimize batch processing - -## Further resources +## Further Resources +- [Azure Developer CLI 
Documentation](https://learn.microsoft.com/azure/developer/azure-developer-cli/) - [Azure DocumentDB Documentation](https://learn.microsoft.com/azure/documentdb/) -- [Azure OpenAI Service Documentation](https://learn.microsoft.com/azure/cognitive-services/openai/) -- [Vector Search in Azure DocumentDB](https://learn.microsoft.com/azure/cosmos-db/vector-database) +- [Azure OpenAI Service Documentation](https://learn.microsoft.com/azure/ai-services/openai/) +- [Vector Search in DocumentDB](https://learn.microsoft.com/azure/documentdb/vector-search) - [MongoDB Java Driver Documentation](https://mongodb.github.io/mongo-java-driver/) - [Azure SDK for Java Documentation](https://learn.microsoft.com/java/api/overview/azure/) ## Support If you encounter issues: -1. Check the troubleshooting section above -2. Review Azure resource configurations -3. Verify environment variable settings -4. Check Azure service status and quotas - -## License - -This project is licensed under the MIT License - see the [LICENSE](../LICENSE.md) file for details. - -## Contributing - -Contributions are welcome! Please see [CONTRIBUTING](../CONTRIBUTING.md) for details. +1. Verify Java 21+ is installed: `java -version` +2. Verify Maven is installed: `mvn -version` +3. Ensure Azure CLI is logged in: `az login` +4. Verify environment variables are exported: `echo $MONGO_CLUSTER_NAME` +5. 
/**
 * Application configuration: a merged view of the classpath
 * {@code application.properties} file and the process environment variables.
 * Environment variables take precedence over file values.
 */
public class AppConfig {
    // Final merged settings; built once in the constructor.
    private final Map<String, String> settings;

    public AppConfig() {
        var merged = new HashMap<String, String>();
        // File values first, then overwrite with the environment so that
        // environment variables win.
        readClasspathProperties()
            .forEach((key, value) -> merged.put(key.toString(), value.toString()));
        merged.putAll(System.getenv());
        this.settings = merged;
    }

    /**
     * Loads application.properties from the classpath. A missing resource
     * yields an empty Properties; read errors are reported as a warning and
     * likewise yield an empty result.
     */
    private Properties readClasspathProperties() {
        var properties = new Properties();
        try (InputStream stream = getClass().getClassLoader().getResourceAsStream("application.properties")) {
            if (stream != null) {
                properties.load(stream);
            }
        } catch (IOException e) {
            System.err.println("Warning: Could not read application.properties file: " + e.getMessage());
        }
        return properties;
    }

    /** Returns the value for {@code key}, or {@code null} if absent. */
    public String get(String key) {
        return settings.get(key);
    }

    /** Returns the value for {@code key}, or {@code defaultValue} if absent. */
    public String getOrDefault(String key, String defaultValue) {
        return settings.getOrDefault(key, defaultValue);
    }

    /**
     * Returns the value for {@code key} parsed as an int, or
     * {@code defaultValue} if the key is absent.
     *
     * @throws NumberFormatException if the value is present but not an int
     */
    public int getIntOrDefault(String key, int defaultValue) {
        var raw = settings.get(key);
        return raw == null ? defaultValue : Integer.parseInt(raw);
    }
}
*/ public class DiskAnn { - private static final String SAMPLE_QUERY = "What are some hotels with good accessibility?"; - private static final String DATABASE_NAME = "travel"; + private static final String SAMPLE_QUERY = "quintessential lodging near running trails, eateries, retail"; + private static final String DATABASE_NAME = "Hotels"; private static final String COLLECTION_NAME = "hotels_diskann"; private static final String VECTOR_INDEX_NAME = "vectorIndex_diskann"; - - private final AppConfig config = new AppConfig(); - private final ObjectMapper objectMapper = new ObjectMapper(); - + + private final JsonMapper jsonMapper = JsonMapper.builder().build(); + public static void main(String[] args) { new DiskAnn().run(); System.exit(0); } - + public void run() { try (var mongoClient = createMongoClient()) { var openAIClient = createOpenAIClient(); - + var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - + // Drop and recreate collection collection.drop(); database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - + // Load and insert data var hotelData = loadHotelData(); insertDataInBatches(collection, hotelData); - + // Create standard indexes - collection.createIndex(Indexes.ascending("HotelName")); - collection.createIndex(Indexes.ascending("Category")); - + createStandardIndexes(collection); + // Create vector index createVectorIndex(database); - + // Perform vector search var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); performVectorSearch(collection, queryEmbedding); - + } catch (Exception e) { System.err.println("Error: " + e.getMessage()); e.printStackTrace(); } } - + private MongoClient createMongoClient() { - var clusterName = config.get("MONGO_CLUSTER_NAME"); + var clusterName = System.getenv("MONGO_CLUSTER_NAME"); + var managedIdentityPrincipalId = System.getenv("AZURE_MANAGED_IDENTITY_PRINCIPAL_ID"); var 
azureCredential = new DefaultAzureCredentialBuilder().build(); - + MongoCredential.OidcCallback callback = (MongoCredential.OidcCallbackContext context) -> { var token = azureCredential.getToken( new com.azure.core.credential.TokenRequestContext() .addScopes("https://ossrdbms-aad.database.windows.net/.default") ).block(); - + if (token == null) { throw new RuntimeException("Failed to obtain Azure AD token"); } - + return new MongoCredential.OidcCallbackResult(token.getToken()); }; - + var credential = MongoCredential.createOidcCredential(null) .withMechanismProperty("OIDC_CALLBACK", callback); - + var connectionString = new ConnectionString( - String.format("mongodb+srv://%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", clusterName) + String.format("mongodb+srv://%s@%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", + managedIdentityPrincipalId, clusterName) ); - + var settings = MongoClientSettings.builder() .applyConnectionString(connectionString) .credential(credential) .build(); - + return MongoClients.create(settings); } - + private OpenAIClient createOpenAIClient() { - var endpoint = config.get("AZURE_OPENAI_EMBEDDING_ENDPOINT"); + var endpoint = System.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"); var credential = new DefaultAzureCredentialBuilder().build(); - + return new OpenAIClientBuilder() .endpoint(endpoint) .credential(credential) .buildClient(); } - - private List loadHotelData() throws IOException { - var dataFile = config.getOrDefault("DATA_FILE_WITH_VECTORS", "HotelsData_toCosmosDB_Vector.json"); + + private List> loadHotelData() throws IOException { + var dataFile = System.getenv("DATA_FILE_WITH_VECTORS"); var filePath = Path.of(dataFile); - + System.out.println("Reading JSON file from " + filePath.toAbsolutePath()); var jsonContent = Files.readString(filePath); - - return objectMapper.readValue(jsonContent, new TypeReference>() 
{}); + + return jsonMapper.readValue(jsonContent, new TypeReference>>() {}); } - - private void insertDataInBatches(MongoCollection collection, List hotelData) { - var batchSize = config.getIntOrDefault("LOAD_SIZE_BATCH", 100); + + private void insertDataInBatches(MongoCollection collection, List> hotelData) { + var batchSizeStr = System.getenv("LOAD_SIZE_BATCH"); + var batchSize = batchSizeStr != null ? Integer.parseInt(batchSizeStr) : 100; var batches = partitionList(hotelData, batchSize); - + System.out.println("Processing in batches of " + batchSize + "..."); - + for (int i = 0; i < batches.size(); i++) { var batch = batches.get(i); var documents = batch.stream() - .map(this::convertToDocument) + .map(Document::new) .toList(); - + collection.insertMany(documents); System.out.println("Batch " + (i + 1) + " complete: " + documents.size() + " inserted"); } } - - private Document convertToDocument(HotelData hotel) { - try { - var json = objectMapper.writeValueAsString(hotel); - return Document.parse(json); - } catch (Exception e) { - throw new RuntimeException("Failed to convert hotel to document", e); - } + + private void createStandardIndexes(MongoCollection collection) { + collection.createIndex(Indexes.ascending("HotelId")); + collection.createIndex(Indexes.ascending("Category")); + collection.createIndex(Indexes.ascending("Description")); + collection.createIndex(Indexes.ascending("Description_fr")); } - + private void createVectorIndex(MongoDatabase database) { + var embeddedField = System.getenv("EMBEDDED_FIELD"); + var dimensionsStr = System.getenv("EMBEDDING_DIMENSIONS"); + var dimensions = dimensionsStr != null ? 
Integer.parseInt(dimensionsStr) : 1536; + var indexDefinition = new Document() .append("createIndexes", COLLECTION_NAME) .append("indexes", List.of( new Document() .append("name", VECTOR_INDEX_NAME) - .append("key", new Document("text_embedding_ada_002", "cosmosSearch")) + .append("key", new Document(embeddedField, "cosmosSearch")) .append("cosmosSearchOptions", new Document() .append("kind", "vector-diskann") - .append("dimensions", config.getIntOrDefault("EMBEDDING_DIMENSIONS", 1536)) + .append("dimensions", dimensions) .append("similarity", "COS") + .append("maxDegree", 20) + .append("lBuild", 10) ) )); - + database.runCommand(indexDefinition); System.out.println("Created vector index: " + VECTOR_INDEX_NAME); } - + private List createEmbedding(OpenAIClient openAIClient, String text) { - var model = config.getOrDefault("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"); + var model = System.getenv("AZURE_OPENAI_EMBEDDING_MODEL"); var options = new EmbeddingsOptions(List.of(text)); - + var response = openAIClient.getEmbeddings(model, options); return response.getData().get(0).getEmbedding().stream() .map(Float::doubleValue) .toList(); } - + private void performVectorSearch(MongoCollection collection, List queryEmbedding) { + var embeddedField = System.getenv("EMBEDDED_FIELD"); + var searchStage = new Document("$search", new Document() .append("cosmosSearch", new Document() .append("vector", queryEmbedding) - .append("path", "text_embedding_ada_002") + .append("path", embeddedField) .append("k", 5) ) ); - + var projectStage = new Document("$project", new Document() - .append("HotelName", 1) .append("score", new Document("$meta", "searchScore")) + .append("document", "$$ROOT") ); - + var pipeline = List.of(searchStage, projectStage); - + System.out.println("\nVector search results for: \"" + SAMPLE_QUERY + "\""); - + AggregateIterable results = collection.aggregate(pipeline); var rank = 1; - + for (var result : results) { - var hotelName = 
result.getString("HotelName"); + var document = result.get("document", Document.class); + var hotelName = document.getString("HotelName"); var score = result.getDouble("score"); System.out.printf("%d. HotelName: %s, Score: %.4f%n", rank++, hotelName, score); } } - + private static List> partitionList(List list, int batchSize) { var partitions = new ArrayList>(); for (int i = 0; i < list.size(); i += batchSize) { diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java index ca580f3..146fc27 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java @@ -4,211 +4,218 @@ import com.azure.ai.openai.OpenAIClientBuilder; import com.azure.ai.openai.models.EmbeddingsOptions; import com.azure.identity.DefaultAzureCredentialBuilder; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.ObjectMapper; +import com.mongodb.ConnectionString; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; +import com.mongodb.client.AggregateIterable; import com.mongodb.client.MongoClient; import com.mongodb.client.MongoClients; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; -import com.mongodb.client.AggregateIterable; import com.mongodb.client.model.Indexes; -import com.mongodb.MongoClientSettings; -import com.mongodb.MongoCredential; -import com.mongodb.ConnectionString; import org.bson.Document; +import tools.jackson.core.type.TypeReference; +import tools.jackson.databind.json.JsonMapper; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.Map; /** - * Vector search application using HNSW index. + * Vector search sample using HNSW index. 
*/ public class HNSW { - private static final String SAMPLE_QUERY = "What are some hotels with good accessibility?"; - private static final String DATABASE_NAME = "travel"; + private static final String SAMPLE_QUERY = "quintessential lodging near running trails, eateries, retail"; + private static final String DATABASE_NAME = "Hotels"; private static final String COLLECTION_NAME = "hotels_hnsw"; private static final String VECTOR_INDEX_NAME = "vectorIndex_hnsw"; - - private final AppConfig config = new AppConfig(); - private final ObjectMapper objectMapper = new ObjectMapper(); - + + private final JsonMapper jsonMapper = JsonMapper.builder().build(); + public static void main(String[] args) { new HNSW().run(); System.exit(0); } - + public void run() { try (var mongoClient = createMongoClient()) { var openAIClient = createOpenAIClient(); - + var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - + // Drop and recreate collection collection.drop(); database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - + // Load and insert data var hotelData = loadHotelData(); insertDataInBatches(collection, hotelData); - + // Create standard indexes - collection.createIndex(Indexes.ascending("HotelName")); - collection.createIndex(Indexes.ascending("Category")); - + createStandardIndexes(collection); + // Create vector index createVectorIndex(database); - + // Perform vector search var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); performVectorSearch(collection, queryEmbedding); - + } catch (Exception e) { System.err.println("Error: " + e.getMessage()); e.printStackTrace(); } } - + private MongoClient createMongoClient() { - var clusterName = config.get("MONGO_CLUSTER_NAME"); + var clusterName = System.getenv("MONGO_CLUSTER_NAME"); + var managedIdentityPrincipalId = System.getenv("AZURE_MANAGED_IDENTITY_PRINCIPAL_ID"); var azureCredential = 
new DefaultAzureCredentialBuilder().build(); - + MongoCredential.OidcCallback callback = (MongoCredential.OidcCallbackContext context) -> { var token = azureCredential.getToken( new com.azure.core.credential.TokenRequestContext() .addScopes("https://ossrdbms-aad.database.windows.net/.default") ).block(); - + if (token == null) { throw new RuntimeException("Failed to obtain Azure AD token"); } - + return new MongoCredential.OidcCallbackResult(token.getToken()); }; - + var credential = MongoCredential.createOidcCredential(null) .withMechanismProperty("OIDC_CALLBACK", callback); - + var connectionString = new ConnectionString( - String.format("mongodb+srv://%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", clusterName) + String.format("mongodb+srv://%s@%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", + managedIdentityPrincipalId, clusterName) ); - + var settings = MongoClientSettings.builder() .applyConnectionString(connectionString) .credential(credential) .build(); - + return MongoClients.create(settings); } - + private OpenAIClient createOpenAIClient() { - var endpoint = config.get("AZURE_OPENAI_EMBEDDING_ENDPOINT"); + var endpoint = System.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"); var credential = new DefaultAzureCredentialBuilder().build(); - + return new OpenAIClientBuilder() .endpoint(endpoint) .credential(credential) .buildClient(); } - - private List loadHotelData() throws IOException { - var dataFile = config.getOrDefault("DATA_FILE_WITH_VECTORS", "HotelsData_toCosmosDB_Vector.json"); + + private List> loadHotelData() throws IOException { + var dataFile = System.getenv("DATA_FILE_WITH_VECTORS"); var filePath = Path.of(dataFile); - + System.out.println("Reading JSON file from " + filePath.toAbsolutePath()); var jsonContent = Files.readString(filePath); - - return objectMapper.readValue(jsonContent, new TypeReference>() {}); + + return 
jsonMapper.readValue(jsonContent, new TypeReference>>() {}); } - - private void insertDataInBatches(MongoCollection collection, List hotelData) { - var batchSize = config.getIntOrDefault("LOAD_SIZE_BATCH", 100); + + private void insertDataInBatches(MongoCollection collection, List> hotelData) { + var batchSizeStr = System.getenv("LOAD_SIZE_BATCH"); + var batchSize = batchSizeStr != null ? Integer.parseInt(batchSizeStr) : 100; var batches = partitionList(hotelData, batchSize); - + System.out.println("Processing in batches of " + batchSize + "..."); - + for (int i = 0; i < batches.size(); i++) { var batch = batches.get(i); var documents = batch.stream() - .map(this::convertToDocument) + .map(Document::new) .toList(); - + collection.insertMany(documents); System.out.println("Batch " + (i + 1) + " complete: " + documents.size() + " inserted"); } } - - private Document convertToDocument(HotelData hotel) { - try { - var json = objectMapper.writeValueAsString(hotel); - return Document.parse(json); - } catch (Exception e) { - throw new RuntimeException("Failed to convert hotel to document", e); - } + + private void createStandardIndexes(MongoCollection collection) { + collection.createIndex(Indexes.ascending("HotelId")); + collection.createIndex(Indexes.ascending("Category")); + collection.createIndex(Indexes.ascending("Description")); + collection.createIndex(Indexes.ascending("Description_fr")); } - + private void createVectorIndex(MongoDatabase database) { + var embeddedField = System.getenv("EMBEDDED_FIELD"); + var dimensionsStr = System.getenv("EMBEDDING_DIMENSIONS"); + var dimensions = dimensionsStr != null ? 
Integer.parseInt(dimensionsStr) : 1536; + var indexDefinition = new Document() .append("createIndexes", COLLECTION_NAME) .append("indexes", List.of( new Document() .append("name", VECTOR_INDEX_NAME) - .append("key", new Document("text_embedding_ada_002", "cosmosSearch")) + .append("key", new Document(embeddedField, "cosmosSearch")) .append("cosmosSearchOptions", new Document() .append("kind", "vector-hnsw") - .append("dimensions", config.getIntOrDefault("EMBEDDING_DIMENSIONS", 1536)) + .append("dimensions", dimensions) .append("similarity", "COS") .append("m", 16) .append("efConstruction", 64) ) )); - + database.runCommand(indexDefinition); System.out.println("Created vector index: " + VECTOR_INDEX_NAME); } - + private List createEmbedding(OpenAIClient openAIClient, String text) { - var model = config.getOrDefault("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"); + var model = System.getenv("AZURE_OPENAI_EMBEDDING_MODEL"); var options = new EmbeddingsOptions(List.of(text)); - + var response = openAIClient.getEmbeddings(model, options); return response.getData().get(0).getEmbedding().stream() .map(Float::doubleValue) .toList(); } - + private void performVectorSearch(MongoCollection collection, List queryEmbedding) { + var embeddedField = System.getenv("EMBEDDED_FIELD"); + var searchStage = new Document("$search", new Document() .append("cosmosSearch", new Document() .append("vector", queryEmbedding) - .append("path", "text_embedding_ada_002") + .append("path", embeddedField) .append("k", 5) ) ); - + var projectStage = new Document("$project", new Document() - .append("HotelName", 1) .append("score", new Document("$meta", "searchScore")) + .append("document", "$$ROOT") ); - + var pipeline = List.of(searchStage, projectStage); - + System.out.println("\nVector search results for: \"" + SAMPLE_QUERY + "\""); - + AggregateIterable results = collection.aggregate(pipeline); var rank = 1; - + for (var result : results) { - var hotelName = 
result.getString("HotelName"); + var document = result.get("document", Document.class); + var hotelName = document.getString("HotelName"); var score = result.getDouble("score"); System.out.printf("%d. HotelName: %s, Score: %.4f%n", rank++, hotelName, score); } } - + private static List> partitionList(List list, int batchSize) { var partitions = new ArrayList>(); for (int i = 0; i < list.size(); i += batchSize) { diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HotelData.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HotelData.java deleted file mode 100644 index 51024bb..0000000 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HotelData.java +++ /dev/null @@ -1,31 +0,0 @@ -package com.azure.documentdb.samples; - -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.fasterxml.jackson.annotation.JsonProperty; -import java.util.List; - -/** - * Represents hotel data with vector embeddings. - */ -@JsonIgnoreProperties(ignoreUnknown = true) -public record HotelData( - @JsonProperty("HotelId") String hotelId, - @JsonProperty("HotelName") String hotelName, - @JsonProperty("Description") String description, - @JsonProperty("Category") String category, - @JsonProperty("Tags") List tags, - @JsonProperty("ParkingIncluded") boolean parkingIncluded, - @JsonProperty("SmokingAllowed") boolean smokingAllowed, - @JsonProperty("LastRenovationDate") String lastRenovationDate, - @JsonProperty("Rating") double rating, - @JsonProperty("Address") Address address, - @JsonProperty("text_embedding_ada_002") List textEmbeddingAda002 -) { - public record Address( - @JsonProperty("StreetAddress") String streetAddress, - @JsonProperty("City") String city, - @JsonProperty("StateProvince") String stateProvince, - @JsonProperty("PostalCode") String postalCode, - @JsonProperty("Country") String country - ) {} -} diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java 
b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java index 7126df1..e800107 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java @@ -4,210 +4,217 @@ import com.azure.ai.openai.OpenAIClientBuilder; import com.azure.ai.openai.models.EmbeddingsOptions; import com.azure.identity.DefaultAzureCredentialBuilder; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.ObjectMapper; +import com.mongodb.ConnectionString; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; +import com.mongodb.client.AggregateIterable; import com.mongodb.client.MongoClient; import com.mongodb.client.MongoClients; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; -import com.mongodb.client.AggregateIterable; import com.mongodb.client.model.Indexes; -import com.mongodb.MongoClientSettings; -import com.mongodb.MongoCredential; -import com.mongodb.ConnectionString; import org.bson.Document; +import tools.jackson.core.type.TypeReference; +import tools.jackson.databind.json.JsonMapper; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.Map; /** - * Vector search application using IVF index. + * Vector search sample using IVF index. 
*/ public class IVF { - private static final String SAMPLE_QUERY = "What are some hotels with good accessibility?"; - private static final String DATABASE_NAME = "travel"; + private static final String SAMPLE_QUERY = "quintessential lodging near running trails, eateries, retail"; + private static final String DATABASE_NAME = "Hotels"; private static final String COLLECTION_NAME = "hotels_ivf"; private static final String VECTOR_INDEX_NAME = "vectorIndex_ivf"; - - private final AppConfig config = new AppConfig(); - private final ObjectMapper objectMapper = new ObjectMapper(); - + + private final JsonMapper jsonMapper = JsonMapper.builder().build(); + public static void main(String[] args) { new IVF().run(); System.exit(0); } - + public void run() { try (var mongoClient = createMongoClient()) { var openAIClient = createOpenAIClient(); - + var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - + // Drop and recreate collection collection.drop(); database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - + // Load and insert data var hotelData = loadHotelData(); insertDataInBatches(collection, hotelData); - + // Create standard indexes - collection.createIndex(Indexes.ascending("HotelName")); - collection.createIndex(Indexes.ascending("Category")); - + createStandardIndexes(collection); + // Create vector index createVectorIndex(database); - + // Perform vector search var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); performVectorSearch(collection, queryEmbedding); - + } catch (Exception e) { System.err.println("Error: " + e.getMessage()); e.printStackTrace(); } } - + private MongoClient createMongoClient() { - var clusterName = config.get("MONGO_CLUSTER_NAME"); + var clusterName = System.getenv("MONGO_CLUSTER_NAME"); + var managedIdentityPrincipalId = System.getenv("AZURE_MANAGED_IDENTITY_PRINCIPAL_ID"); var azureCredential = new 
DefaultAzureCredentialBuilder().build(); - + MongoCredential.OidcCallback callback = (MongoCredential.OidcCallbackContext context) -> { var token = azureCredential.getToken( new com.azure.core.credential.TokenRequestContext() .addScopes("https://ossrdbms-aad.database.windows.net/.default") ).block(); - + if (token == null) { throw new RuntimeException("Failed to obtain Azure AD token"); } - + return new MongoCredential.OidcCallbackResult(token.getToken()); }; - + var credential = MongoCredential.createOidcCredential(null) .withMechanismProperty("OIDC_CALLBACK", callback); - + var connectionString = new ConnectionString( - String.format("mongodb+srv://%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", clusterName) + String.format("mongodb+srv://%s@%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", + managedIdentityPrincipalId, clusterName) ); - + var settings = MongoClientSettings.builder() .applyConnectionString(connectionString) .credential(credential) .build(); - + return MongoClients.create(settings); } - + private OpenAIClient createOpenAIClient() { - var endpoint = config.get("AZURE_OPENAI_EMBEDDING_ENDPOINT"); + var endpoint = System.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"); var credential = new DefaultAzureCredentialBuilder().build(); - + return new OpenAIClientBuilder() .endpoint(endpoint) .credential(credential) .buildClient(); } - - private List loadHotelData() throws IOException { - var dataFile = config.getOrDefault("DATA_FILE_WITH_VECTORS", "HotelsData_toCosmosDB_Vector.json"); + + private List> loadHotelData() throws IOException { + var dataFile = System.getenv("DATA_FILE_WITH_VECTORS"); var filePath = Path.of(dataFile); - + System.out.println("Reading JSON file from " + filePath.toAbsolutePath()); var jsonContent = Files.readString(filePath); - - return objectMapper.readValue(jsonContent, new TypeReference>() {}); + + return 
jsonMapper.readValue(jsonContent, new TypeReference>>() {}); } - - private void insertDataInBatches(MongoCollection collection, List hotelData) { - var batchSize = config.getIntOrDefault("LOAD_SIZE_BATCH", 100); + + private void insertDataInBatches(MongoCollection collection, List> hotelData) { + var batchSizeStr = System.getenv("LOAD_SIZE_BATCH"); + var batchSize = batchSizeStr != null ? Integer.parseInt(batchSizeStr) : 100; var batches = partitionList(hotelData, batchSize); - + System.out.println("Processing in batches of " + batchSize + "..."); - + for (int i = 0; i < batches.size(); i++) { var batch = batches.get(i); var documents = batch.stream() - .map(this::convertToDocument) + .map(Document::new) .toList(); - + collection.insertMany(documents); System.out.println("Batch " + (i + 1) + " complete: " + documents.size() + " inserted"); } } - - private Document convertToDocument(HotelData hotel) { - try { - var json = objectMapper.writeValueAsString(hotel); - return Document.parse(json); - } catch (Exception e) { - throw new RuntimeException("Failed to convert hotel to document", e); - } + + private void createStandardIndexes(MongoCollection collection) { + collection.createIndex(Indexes.ascending("HotelId")); + collection.createIndex(Indexes.ascending("Category")); + collection.createIndex(Indexes.ascending("Description")); + collection.createIndex(Indexes.ascending("Description_fr")); } - + private void createVectorIndex(MongoDatabase database) { + var embeddedField = System.getenv("EMBEDDED_FIELD"); + var dimensionsStr = System.getenv("EMBEDDING_DIMENSIONS"); + var dimensions = dimensionsStr != null ? 
Integer.parseInt(dimensionsStr) : 1536; + var indexDefinition = new Document() .append("createIndexes", COLLECTION_NAME) .append("indexes", List.of( new Document() .append("name", VECTOR_INDEX_NAME) - .append("key", new Document("text_embedding_ada_002", "cosmosSearch")) + .append("key", new Document(embeddedField, "cosmosSearch")) .append("cosmosSearchOptions", new Document() .append("kind", "vector-ivf") - .append("dimensions", config.getIntOrDefault("EMBEDDING_DIMENSIONS", 1536)) + .append("dimensions", dimensions) .append("similarity", "COS") - .append("numLists", 1) + .append("numLists", 10) ) )); - + database.runCommand(indexDefinition); System.out.println("Created vector index: " + VECTOR_INDEX_NAME); } - + private List createEmbedding(OpenAIClient openAIClient, String text) { - var model = config.getOrDefault("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"); + var model = System.getenv("AZURE_OPENAI_EMBEDDING_MODEL"); var options = new EmbeddingsOptions(List.of(text)); - + var response = openAIClient.getEmbeddings(model, options); return response.getData().get(0).getEmbedding().stream() .map(Float::doubleValue) .toList(); } - + private void performVectorSearch(MongoCollection collection, List queryEmbedding) { + var embeddedField = System.getenv("EMBEDDED_FIELD"); + var searchStage = new Document("$search", new Document() .append("cosmosSearch", new Document() .append("vector", queryEmbedding) - .append("path", "text_embedding_ada_002") + .append("path", embeddedField) .append("k", 5) ) ); - + var projectStage = new Document("$project", new Document() - .append("HotelName", 1) .append("score", new Document("$meta", "searchScore")) + .append("document", "$$ROOT") ); - + var pipeline = List.of(searchStage, projectStage); - + System.out.println("\nVector search results for: \"" + SAMPLE_QUERY + "\""); - + AggregateIterable results = collection.aggregate(pipeline); var rank = 1; - + for (var result : results) { - var hotelName = 
result.getString("HotelName"); + var document = result.get("document", Document.class); + var hotelName = document.getString("HotelName"); var score = result.getDouble("score"); System.out.printf("%d. HotelName: %s, Score: %.4f%n", rank++, hotelName, score); } } - + private static List> partitionList(List list, int batchSize) { var partitions = new ArrayList>(); for (int i = 0; i < list.size(); i += batchSize) {