extension OpenAI.APISpecification.RequestBodies {
    /// Request body for the `/v1/audio/speech` (text-to-speech) endpoint.
    public struct CreateSpeech: Codable {

        /// Encapsulates the voices available for audio generation.
        ///
        /// To get acquainted with each of the voices and listen to the samples visit:
        /// [OpenAI Text-to-Speech – Voice Options](https://platform.openai.com/docs/guides/text-to-speech/voice-options)
        public enum Voice: String, Codable, CaseIterable {
            case alloy
            case echo
            case fable
            case onyx
            case nova
            case shimmer
        }

        /// Supported output audio container formats.
        public enum ResponseFormat: String, Codable, CaseIterable {
            case mp3
            case opus
            case aac
            case flac
        }

        /// The text to generate audio for. The maximum length is 4096 characters.
        public let input: String
        /// One of the available TTS models: tts-1 or tts-1-hd.
        public let model: OpenAI.Model
        /// The voice to use when generating the audio. Supported voices are alloy, echo, fable, onyx, nova, and shimmer.
        /// https://platform.openai.com/docs/guides/text-to-speech/voice-options
        public let voice: Voice
        /// The format of the generated audio. Supported formats are mp3, opus, aac, and flac.
        /// Defaults to mp3.
        public let responseFormat: ResponseFormat?
        /// The speed of the generated audio, clamped to **0.25 ... 4.0** (**1.0** is the default),
        /// rendered as a string for the wire format.
        public let speed: String?

        public enum CodingKeys: String, CodingKey {
            case model
            case input
            case voice
            case responseFormat = "response_format"
            case speed
        }

        public init(
            model: OpenAI.Model,
            input: String,
            voice: Voice,
            responseFormat: ResponseFormat = .mp3,
            speed: Double?
        ) {
            self.model = model
            self.speed = CreateSpeech.normalizedSpeechSpeed(for: speed)
            self.input = input
            self.voice = voice
            self.responseFormat = responseFormat
        }

        /// Sentinel values for the API-supported speed range.
        enum Speed: Double {
            case normal = 1.0
            case max = 4.0
            case min = 0.25
        }

        /// Clamps `inputSpeed` into the supported `0.25...4.0` range and renders it as a string.
        ///
        /// FIX: the previous implementation classified the exact lower bound (0.25) as
        /// out-of-bounds (`inputSpeed <= min`) and then, because `0.25 < 0.25` is false,
        /// returned the MAXIMUM speed "4.0". Clamping with `min`/`max` handles both
        /// boundaries correctly and preserves all other outputs.
        static func normalizedSpeechSpeed(for inputSpeed: Double?) -> String {
            guard let inputSpeed else {
                return "\(Self.Speed.normal.rawValue)"
            }
            let clamped = Swift.min(Swift.max(inputSpeed, Self.Speed.min.rawValue), Self.Speed.max.rawValue)
            return "\(clamped)"
        }
    }
}
// NOTE(review): in the patch, `contextSize` sits inside a declaration whose header is
// outside the visible hunk; the cases shown (`.gpt_3_5_turbo`, `.gpt_4`, ...) indicate
// the `Chat` model enum — confirm the enclosing type before applying.
extension OpenAI.Model.Chat {
    /// Maximum context window, in tokens, for each chat model.
    public var contextSize: Int {
        let _4k = 4096
        let _8k = 8192
        let _16k = 16385 // gpt-3.5-turbo-16k family is 16,385 tokens
        let _32k = 32768
        let _128k = 128000

        return switch self {
        case .gpt_3_5_turbo, .gpt_3_5_turbo_0125, .gpt_3_5_turbo_16k:
            _16k
        case .gpt_3_5_turbo_0301, .gpt_3_5_turbo_0613:
            _4k
        case .gpt_3_5_turbo_16k_0613:
            _16k
        case .gpt_4, .gpt_4_0314, .gpt_4_0613:
            _8k
        case .gpt_4_32k, .gpt_4_32k_0314, .gpt_4_32k_0613:
            _32k
        case .gpt_4_1106_preview, .gpt_4_0125_preview, .gpt_4_vision_preview, .gpt_4_turbo_preview:
            _128k
        // FIX(review): the replaced code also handled `.gpt_4_turbo` and
        // `.__deprecated_gpt_4_turbo_preview`. If those cases still exist on the enum,
        // the patched switch is non-exhaustive and will not compile without them.
        // GPT-4 Turbo's context window is 128,000 tokens per the OpenAI models docs.
        case .gpt_4_turbo, .__deprecated_gpt_4_turbo_preview:
            _128k
        }
    }
}

extension OpenAI.Model {
    /// Text-to-speech models usable with the `/v1/audio/speech` endpoint.
    public enum Speech: String, Named, OpenAI._ModelType, CaseIterable {
        case tts_1 = "tts-1"
        case tts_1_hd = "tts-1-hd"

        /// Maximum input length for the speech endpoint (4096 characters per the API docs).
        public var contextSize: Int {
            4096
        }

        /// Human-readable display name for the model.
        public var name: String {
            switch self {
            case .tts_1:
                "Text-to-speech"
            case .tts_1_hd:
                "Text-to-speech HD"
            }
        }
    }
}
extension OpenAI {
    /// The result of a text-to-speech request: the raw generated audio bytes.
    public final class Speech: OpenAI.Object {
        /// The generated audio payload (format chosen at request time, mp3 by default).
        public let data: Data

        enum CodingKeys: CodingKey {
            case data
        }

        public required init(from decoder: Decoder) throws {
            let container = try decoder.container(keyedBy: CodingKeys.self)

            // NOTE(review): relies on a project-provided `decode(forKey:)` overload that
            // infers the value type; the standard-library spelling is
            // `decode(Data.self, forKey: .data)` — confirm the overload exists.
            self.data = try container.decode(forKey: .data)

            try super.init(from: decoder)
        }

        public init(data: Data) {
            self.data = data
            super.init(type: .speech)
        }
    }
}

extension OpenAI.APIClient {
    /// Generates speech audio for `text` via the `/v1/audio/speech` endpoint.
    ///
    /// - Parameters:
    ///   - model: A TTS-capable model (`tts-1` or `tts-1-hd`).
    ///   - text: The text to synthesize (maximum 4096 characters).
    ///   - voice: The voice to use; defaults to `.alloy`.
    ///   - speed: Playback speed, clamped to 0.25...4.0 by the request body;
    ///     `nil` yields the default of 1.0. (Default added for call-site ergonomics;
    ///     backward-compatible with existing callers that pass it explicitly.)
    public func createSpeech(
        model: OpenAI.Model,
        text: String,
        voice: OpenAI.APISpecification.RequestBodies.CreateSpeech.Voice = .alloy,
        speed: Double? = nil
    ) async throws -> OpenAI.Speech {
        let requestBody = OpenAI.APISpecification.RequestBodies.CreateSpeech(
            model: model,
            input: text,
            voice: voice,
            speed: speed
        )
        let data = try await run(\.createSpeech, with: requestBody)
        return OpenAI.Speech(data: data)
    }

    /// Strongly-typed convenience overload; forwards to
    /// `createSpeech(model:text:voice:speed:)` instead of duplicating the request construction.
    public func createSpeech(
        model: OpenAI.Model.Speech,
        text: String,
        voice: OpenAI.APISpecification.RequestBodies.CreateSpeech.Voice = .alloy,
        speed: Double? = nil
    ) async throws -> OpenAI.Speech {
        try await createSpeech(
            model: OpenAI.Model.speech(model),
            text: text,
            voice: voice,
            speed: speed
        )
    }
}