From 684843cee55a7eb42a46ed51bc1279bc0122c8c3 Mon Sep 17 00:00:00 2001
From: Rudrank Riyam
Date: Wed, 30 Oct 2024 00:18:06 +0900
Subject: [PATCH] Add new batch scrape API

---
Review notes: the line structure of this patch was restored and the hunk
bodies were revised, so the blob hashes on the "index" lines are carried over
from the original patch and no longer match the new contents. Context-line
indentation is assumed to be two spaces; apply with --ignore-whitespace if the
target files differ.

 Sources/AgniKit/AgniKit.swift             | 146 +++++++++++++++++++++-
 Sources/AgniKit/BatchScrapeResponse.swift |  88 +++++++++++++
 Tests/AgniKitTests/AgniKitTests.swift     |   2 +-
 3 files changed, 234 insertions(+), 2 deletions(-)
 create mode 100644 Sources/AgniKit/BatchScrapeResponse.swift

diff --git a/Sources/AgniKit/AgniKit.swift b/Sources/AgniKit/AgniKit.swift
index e6ef656..fc4eb72 100644
--- a/Sources/AgniKit/AgniKit.swift
+++ b/Sources/AgniKit/AgniKit.swift
@@ -286,4 +286,148 @@ public struct AgniKit {
 
     return result
   }
-}
+
+  // MARK: - Batch Scrape
+
+  /// Performs a batch scrape and decodes the complete result set from the response
+  ///
+  /// This method sends a POST request to scrape multiple URLs simultaneously and decodes the
+  /// reply as a finished batch.
+  ///
+  /// - Parameters:
+  ///   - urls: Array of URLs to scrape
+  ///   - formats: Array of desired output formats (e.g. ["markdown", "html"]). Default is ["markdown"]
+  ///   - onlyMainContent: Whether to return only the main content. Default is true
+  ///   - timeout: Timeout in milliseconds. Default is 30000
+  ///
+  /// - Returns: A BatchScrapeResponse containing all scraped results
+  ///
+  /// - Throws: An error if the request fails, if the server does not answer with HTTP 200,
+  ///   or if the response cannot be decoded
+  public func batchScrape(
+    urls: [String],
+    formats: [String] = ["markdown"],
+    onlyMainContent: Bool = true,
+    timeout: Int = 30000
+  ) async throws -> BatchScrapeResponse {
+    // NOTE(review): the POST reply is decoded directly as a finished batch. If the service only
+    // acknowledges the submission with a job ID, pair `createBatchScrapeJob` with
+    // `getBatchScrapeStatus` instead - confirm against the API reference.
+    let request = try makeBatchScrapePOSTRequest(body: [
+      "urls": urls,
+      "formats": formats,
+      "onlyMainContent": onlyMainContent,
+      "timeout": timeout
+    ])
+    let data = try await fetchBatchScrapeData(for: request)
+    return try Self.makeBatchScrapeDecoder().decode(BatchScrapeResponse.self, from: data)
+  }
+
+  /// Creates an asynchronous batch scrape job
+  ///
+  /// This method initiates a batch scrape job and returns immediately with a job ID
+  ///
+  /// - Parameters:
+  ///   - urls: Array of URLs to scrape
+  ///   - formats: Array of desired output formats (e.g. ["markdown", "html"]). Default is ["markdown"]
+  ///   - onlyMainContent: Whether to return only the main content. Default is true
+  ///
+  /// - Returns: A BatchScrapeJobResponse containing the job ID and status URL
+  ///
+  /// - Throws: An error if the request fails, if the server does not answer with HTTP 200,
+  ///   or if the response cannot be decoded
+  public func createBatchScrapeJob(
+    urls: [String],
+    formats: [String] = ["markdown"],
+    onlyMainContent: Bool = true
+  ) async throws -> BatchScrapeJobResponse {
+    let request = try makeBatchScrapePOSTRequest(body: [
+      "urls": urls,
+      "formats": formats,
+      "onlyMainContent": onlyMainContent,
+      "async": true
+    ])
+    let data = try await fetchBatchScrapeData(for: request)
+    return try JSONDecoder().decode(BatchScrapeJobResponse.self, from: data)
+  }
+
+  /// Checks the status of an asynchronous batch scrape job
+  ///
+  /// - Parameter id: The job ID returned from createBatchScrapeJob
+  ///
+  /// - Returns: A BatchScrapeResponse containing the current status and any completed results
+  ///
+  /// - Throws: An error if the request fails, if the server does not answer with HTTP 200,
+  ///   or if the response cannot be decoded
+  public func getBatchScrapeStatus(id: String) async throws -> BatchScrapeResponse {
+    let request = makeRequest(for: "v1/batch/scrape/\(id)")
+    let data = try await fetchBatchScrapeData(for: request)
+    return try Self.makeBatchScrapeDecoder().decode(BatchScrapeResponse.self, from: data)
+  }
+
+  /// Builds a JSON POST request for the batch scrape endpoint
+  ///
+  /// - Parameter body: JSON-compatible dictionary sent as the request body
+  ///
+  /// - Throws: An error if `body` cannot be serialized to JSON
+  private func makeBatchScrapePOSTRequest(body: [String: Any]) throws -> URLRequest {
+    var request = makeRequest(for: "v1/batch/scrape")
+    request.httpMethod = "POST"
+    request.setValue("application/json", forHTTPHeaderField: "Content-Type")
+    request.httpBody = try JSONSerialization.data(withJSONObject: body)
+    return request
+  }
+
+  /// Sends a request and returns the response body when the server answers with HTTP 200
+  ///
+  /// - Throws: An `NSError` in the "AgniKit" domain whose code is 0 for a non-HTTP response,
+  ///   or the HTTP status code for any status other than 200
+  private func fetchBatchScrapeData(for request: URLRequest) async throws -> Data {
+    let (data, response) = try await URLSession.shared.data(for: request)
+
+    guard let httpResponse = response as? HTTPURLResponse else {
+      throw NSError(domain: "AgniKit", code: 0, userInfo: [NSLocalizedDescriptionKey: "Invalid response"])
+    }
+
+    guard httpResponse.statusCode == 200 else {
+      throw NSError(
+        domain: "AgniKit",
+        code: httpResponse.statusCode,
+        userInfo: [NSLocalizedDescriptionKey: "HTTP error \(httpResponse.statusCode)"]
+      )
+    }
+
+    return data
+  }
+
+  /// Creates the decoder used for batch scrape payloads
+  ///
+  /// The built-in `.iso8601` strategy rejects timestamps with fractional seconds, so dates are
+  /// parsed manually, accepting ISO 8601 strings with or without a fractional part.
+  private static func makeBatchScrapeDecoder() -> JSONDecoder {
+    let decoder = JSONDecoder()
+    decoder.dateDecodingStrategy = .custom { dateDecoder in
+      let container = try dateDecoder.singleValueContainer()
+      let text = try container.decode(String.self)
+
+      // A fresh formatter per call keeps the closure free of shared mutable state.
+      let formatter = ISO8601DateFormatter()
+      let optionSets: [ISO8601DateFormatter.Options] = [
+        [.withInternetDateTime, .withFractionalSeconds],
+        [.withInternetDateTime]
+      ]
+      for options in optionSets {
+        formatter.formatOptions = options
+        if let date = formatter.date(from: text) {
+          return date
+        }
+      }
+
+      throw DecodingError.dataCorruptedError(
+        in: container,
+        debugDescription: "Expected an ISO 8601 date but found \(text)"
+      )
+    }
+    return decoder
+  }
+}
diff --git a/Sources/AgniKit/BatchScrapeResponse.swift b/Sources/AgniKit/BatchScrapeResponse.swift
new file mode 100644
index 0000000..57665d9
--- /dev/null
+++ b/Sources/AgniKit/BatchScrapeResponse.swift
@@ -0,0 +1,88 @@
+import Foundation
+
+/// Represents the response from a batch scrape operation
+public struct BatchScrapeResponse: Codable {
+  /// The current status of the batch scrape job
+  public let status: String
+
+  /// Total number of URLs in the batch
+  public let total: Int
+
+  /// Number of completed URL scrapes
+  public let completed: Int
+
+  /// Number of API credits used for this operation
+  public let creditsUsed: Int
+
+  /// Timestamp when the results will expire
+  public let expiresAt: Date
+
+  /// Array of scraped results for each URL
+  public let data: [BatchScrapeResult]
+}
+
+/// Represents the scraped result for a single URL in a batch
+public struct BatchScrapeResult: Codable {
+  /// The scraped content in markdown format (if requested)
+  public let markdown: String?
+
+  /// The scraped content in HTML format (if requested)
+  public let html: String?
+
+  /// Metadata about the scraped page
+  public let metadata: BatchScrapeMetadata
+}
+
+/// Metadata associated with a scraped page
+///
+/// Decoding is lenient for the descriptive text fields: when `title`, `language`, or
+/// `description` is absent or null in the payload, the property is set to an empty string.
+public struct BatchScrapeMetadata: Codable {
+  /// Title of the webpage (empty when not provided)
+  public let title: String
+
+  /// Detected language of the content (empty when not provided)
+  public let language: String
+
+  /// Original URL that was scraped
+  public let sourceURL: String
+
+  /// Meta description of the webpage (empty when not provided)
+  public let description: String
+
+  /// HTTP status code of the response
+  public let statusCode: Int
+
+  /// Keys used when encoding and decoding; declared explicitly so the custom decoder can use them
+  private enum CodingKeys: String, CodingKey {
+    case title, language, sourceURL, description, statusCode
+  }
+}
+
+extension BatchScrapeMetadata {
+  /// Decodes metadata, substituting an empty string for missing descriptive text fields
+  ///
+  /// Declared in an extension so the struct keeps its synthesized memberwise initializer.
+  ///
+  /// - Throws: A decoding error if `sourceURL` or `statusCode` is missing or has the wrong type
+  public init(from decoder: Decoder) throws {
+    let container = try decoder.container(keyedBy: CodingKeys.self)
+    title = try container.decodeIfPresent(String.self, forKey: .title) ?? ""
+    language = try container.decodeIfPresent(String.self, forKey: .language) ?? ""
+    sourceURL = try container.decode(String.self, forKey: .sourceURL)
+    description = try container.decodeIfPresent(String.self, forKey: .description) ?? ""
+    statusCode = try container.decode(Int.self, forKey: .statusCode)
+  }
+}
+
+/// Response for an asynchronous batch scrape job creation
+public struct BatchScrapeJobResponse: Codable {
+  /// Whether the job was successfully created
+  public let success: Bool
+
+  /// The unique identifier for the batch scrape job
+  public let id: String
+
+  /// URL to check the job status
+  public let url: String
+}
diff --git a/Tests/AgniKitTests/AgniKitTests.swift b/Tests/AgniKitTests/AgniKitTests.swift
index bb78902..ceefe53 100644
--- a/Tests/AgniKitTests/AgniKitTests.swift
+++ b/Tests/AgniKitTests/AgniKitTests.swift
@@ -14,7 +14,7 @@ import Foundation
 /// It verifies that the scraped content contains expected elements from the Apple documentation.
 @Test("Scrape Apple's Defining Tests documentation")
 func testWebScraping() async throws {
-  let agniKit = AgniKit(apiKey: "no")
+  let agniKit = AgniKit(apiKey: "fc-")
   let url = "https://developer.apple.com/documentation/testing/definingtests"