Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add TTS support #7

Merged
merged 4 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,78 @@ extension OpenAI.APISpecification.RequestBodies {
}
}

extension OpenAI.APISpecification.RequestBodies {
    /// Request body for the `/v1/audio/speech` (text-to-speech) endpoint.
    public struct CreateSpeech: Codable {
        
        /// Encapsulates the voices available for audio generation.
        ///
        /// To get acquainted with each of the voices and listen to the samples visit:
        /// [OpenAI Text-to-Speech – Voice Options](https://platform.openai.com/docs/guides/text-to-speech/voice-options)
        public enum Voice: String, Codable, CaseIterable {
            case alloy
            case echo
            case fable
            case onyx
            case nova
            case shimmer
        }
        
        /// Audio container formats the endpoint can return.
        public enum ResponseFormat: String, Codable, CaseIterable {
            case mp3
            case opus
            case aac
            case flac
        }
        
        /// The text to generate audio for. The maximum length is 4096 characters.
        public let input: String
        /// One of the available TTS models: tts-1 or tts-1-hd
        public let model: OpenAI.Model
        /// The voice to use when generating the audio. Supported voices are alloy, echo, fable, onyx, nova, and shimmer. Previews of the voices are available in the Text to speech guide.
        /// https://platform.openai.com/docs/guides/text-to-speech/voice-options
        public let voice: Voice
        /// The format to return the audio in. Supported formats are mp3, opus, aac, and flac.
        /// Defaults to mp3.
        public let responseFormat: ResponseFormat?
        /// The speed of the generated audio, serialized as a string for the request body.
        /// Values are clamped to the supported range **0.25**...**4.0**; **1.0** is the default.
        public let speed: String?
        
        public enum CodingKeys: String, CodingKey {
            case model
            case input
            case voice
            case responseFormat = "response_format"
            case speed
        }
        
        public init(model: OpenAI.Model, input: String, voice: Voice, responseFormat: ResponseFormat = .mp3, speed: Double?) {
            self.model = model
            self.input = input
            self.voice = voice
            self.responseFormat = responseFormat
            self.speed = CreateSpeech.normalizedSpeechSpeed(for: speed)
        }
        
        /// Named bounds of the speech-speed range supported by the API.
        enum Speed: Double {
            case normal = 1.0
            case max = 4.0
            case min = 0.25
        }
        
        /// Clamps `inputSpeed` into the supported range [0.25, 4.0] and renders it as a string.
        ///
        /// - Parameter inputSpeed: The requested speed, or `nil` to use the default (1.0).
        /// - Returns: The clamped speed, formatted for the request body.
        static func normalizedSpeechSpeed(for inputSpeed: Double?) -> String {
            guard let inputSpeed else {
                return "\(Speed.normal.rawValue)"
            }
            
            // FIX: the previous implementation treated a speed exactly equal to the
            // minimum (0.25) as out of bounds and then mapped it to the *maximum*
            // (4.0), because `inputSpeed < min` was false for the boundary value.
            // A plain clamp handles all boundary values correctly.
            let clamped = Swift.min(Swift.max(inputSpeed, Speed.min.rawValue), Speed.max.rawValue)
            
            return "\(clamped)"
        }
    }
}



// MARK: - Auxiliary

extension OpenAI.APISpecification.RequestBodies.CreateChatCompletion {
Expand Down
20 changes: 16 additions & 4 deletions Sources/OpenAI/Intramodular/API/OpenAI.APISpecification.swift
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ extension OpenAI {
@Body(json: .input, keyEncodingStrategy: .convertToSnakeCase)
public var createChatCompletions = Endpoint<RequestBodies.CreateChatCompletion, OpenAI.ChatCompletion, Void>()

// MARK: Speech

@POST
@Path("/v1/audio/speech")
@Body(json: .input, keyEncodingStrategy: .convertToSnakeCase)
public var createSpeech = Endpoint<RequestBodies.CreateSpeech, Data, Void>()

// MARK: Threads

@Header(["OpenAI-Beta": "assistants=v1"])
Expand Down Expand Up @@ -218,10 +225,15 @@ extension OpenAI.APISpecification {
throw apiError
}

return try response.decode(
Output.self,
keyDecodingStrategy: .convertFromSnakeCase
)
switch Output.self {
case Data.self:
return try cast(response.data, to: Output.self)
default:
return try response.decode(
Output.self,
keyDecodingStrategy: .convertFromSnakeCase
)
}
}
}
}
112 changes: 52 additions & 60 deletions Sources/OpenAI/Intramodular/Models/OpenAI.Model.swift
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ extension OpenAI {
case instructGPT(InstructGPT)
case embedding(Embedding)
case chat(Chat)
case speech(Speech)

/// Deprecated by OpenAI.
case feature(Feature)
Expand Down Expand Up @@ -58,6 +59,8 @@ extension OpenAI {
return value
case .chat(let value):
return value
case .speech(let value):
return value
case .unknown:
assertionFailure(.unimplemented)

Expand Down Expand Up @@ -203,76 +206,63 @@ extension OpenAI.Model {

public var name: String {
switch self {
case .gpt_3_5_turbo:
return "ChatGPT 3.5"
case .gpt_3_5_turbo_16k:
return "ChatGPT 3.5"
case .gpt_4:
return "ChatGPT 4"
case .gpt_4_32k:
return "ChatGPT 4"
case .gpt_4_1106_preview:
return "GPT-4 Turbo"
case .gpt_4_0125_preview:
return "GPT-4 Turbo"
case .gpt_4_vision_preview:
return "GPT-4V"
case .gpt_3_5_turbo_0301:
return "GPT-3.5"
case .gpt_3_5_turbo_0613:
return "GPT-3.5"
case .gpt_3_5_turbo_0125:
return "GPT-3.5"
case .gpt_3_5_turbo_16k_0613:
return "GPT-3.5"
case .gpt_4_0314:
return "GPT-4"
case .gpt_4_0613:
return "GPT-4"
case .gpt_4_32k_0314:
return "GPT-4"
case .gpt_4_32k_0613:
return "GPT-4"
case .gpt_4_turbo_preview:
return "GPT-4 Turbo (Preview)"
case .gpt_3_5_turbo: "ChatGPT 3.5"
case .gpt_3_5_turbo_16k: "ChatGPT 3.5"
case .gpt_4: "ChatGPT 4"
case .gpt_4_32k: "ChatGPT 4"
case .gpt_4_1106_preview: "GPT-4 Turbo"
case .gpt_4_0125_preview: "GPT-4 Turbo"
case .gpt_4_vision_preview: "GPT-4V"
case .gpt_3_5_turbo_0301: "GPT-3.5"
case .gpt_3_5_turbo_0613: "GPT-3.5"
case .gpt_3_5_turbo_0125: "GPT-3.5"
case .gpt_3_5_turbo_16k_0613: "GPT-3.5"
case .gpt_4_0314: "GPT-4"
case .gpt_4_0613: "GPT-4"
case .gpt_4_32k_0314: "GPT-4"
case .gpt_4_32k_0613: "GPT-4"
case .gpt_4_turbo_preview: "GPT-4 Turbo (Preview)"
}
}

public var contextSize: Int {
let _4k = 4096
let _8k = 8192
let _16k = 16384
let _32k = 16384
let _16k = 16385
let _32k = 32768
let _128k = 128000

// let _128k = 131072

return switch self {
case .gpt_3_5_turbo, .gpt_3_5_turbo_0125, .gpt_3_5_turbo_16k: _16k
case .gpt_4: _8k
case .gpt_4_32k: _32k
case .gpt_3_5_turbo_0301, .gpt_3_5_turbo_0613: _4k
case .gpt_3_5_turbo_16k_0613: _16k
case .gpt_4_0314: _8k
case .gpt_4_0613: _8k
case .gpt_4_32k_0314: _32k
case .gpt_4_32k_0613: _32k
case .gpt_4_1106_preview, .gpt_4_0125_preview: _128k
case .gpt_4_vision_preview: _128k
case .gpt_4_turbo_preview: _128k
}
}
}
}

extension OpenAI.Model {
public enum Speech: String, Named, OpenAI._ModelType, CaseIterable {
case tts_1 = "tts-1"
case tts_1_hd = "tts-1-hd"

public var contextSize: Int { return 4096 }

public var name: String {
switch self {
case .gpt_3_5_turbo:
return _4k
case .gpt_3_5_turbo_16k:
return _16k
case .gpt_4:
return _8k
case .gpt_4_32k:
return _32k
case .gpt_3_5_turbo_0301, .gpt_3_5_turbo_0613, .gpt_3_5_turbo_0125:
return _4k
case .gpt_3_5_turbo_16k_0613:
return _16k
case .gpt_4_0314:
return _8k
case .gpt_4_0613:
return _8k
case .gpt_4_32k_0314:
return _32k
case .gpt_4_32k_0613:
return _32k
case .gpt_4_1106_preview, .gpt_4_0125_preview:
return 4096 // FIXME: !!!
case .gpt_4_vision_preview:
return 4096 // FIXME: !!!
case .gpt_4_turbo_preview:
return 4096 // FIXME: !!!
case .tts_1: "Text-to-speech"
case .tts_1_hd: "Text-to-speech HD"
}
}
}
Expand Down Expand Up @@ -325,6 +315,8 @@ extension OpenAI.Model: RawRepresentable {
return model.rawValue
case .chat(let model):
return model.rawValue
case .speech(let model):
return model.rawValue
case .unknown(let rawValue):
return rawValue
}
Expand Down
3 changes: 3 additions & 0 deletions Sources/OpenAI/Intramodular/Models/OpenAI.Object.swift
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ extension OpenAI {
case textCompletion = "text_completion"
case chatCompletion = "chat.completion"
case chatCompletionChunk = "chat.completion.chunk"
case speech = "speech"
case thread = "thread"
case message = "thread.message"
case assistant = "assistant"
Expand All @@ -30,6 +31,8 @@ extension OpenAI {
return OpenAI.ChatCompletion.self
case .chatCompletionChunk:
return OpenAI.ChatCompletionChunk.self
case .speech:
return OpenAI.Speech.self
case .thread:
return OpenAI.Thread.self
case .message:
Expand Down
31 changes: 31 additions & 0 deletions Sources/OpenAI/Intramodular/Models/OpenAI.Speech.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
//
// OpenAI.Speech.swift
// graph
//
// Created by Purav Manot on 10/03/24.
//

import Foundation

extension OpenAI {
    /// The result of a text-to-speech request: the raw, encoded audio bytes
    /// returned by the `/v1/audio/speech` endpoint.
    public final class Speech: OpenAI.Object {
        /// The encoded audio payload (the container format is whatever the
        /// originating request asked for, e.g. mp3).
        public let data: Data
        
        public required init(from decoder: Decoder) throws {
            let container = try decoder.container(keyedBy: CodingKeys.self)
            
            // Decode with an explicit type: `decode(Data.self, forKey:)` is the
            // standard-library API, whereas a type-inferring `decode(forKey:)`
            // relies on a project-provided overload being in scope.
            self.data = try container.decode(Data.self, forKey: .data)
            
            try super.init(from: decoder)
        }
        
        /// Wraps raw audio bytes received from the API.
        public init(data: Data) {
            self.data = data
            
            super.init(type: .speech)
        }
        
        enum CodingKeys: CodingKey {
            case data
        }
    }
}
35 changes: 35 additions & 0 deletions Sources/OpenAI/Intramodular/OpenAI.APIClient.swift
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,41 @@ extension OpenAI.APIClient {
}
}

extension OpenAI.APIClient {
    /// Generates audio for the given text using a text-to-speech model.
    ///
    /// - Parameters:
    ///   - model: The model to use (expected to be a TTS model such as tts-1 or tts-1-hd).
    ///   - text: The text to generate audio for. The maximum length is 4096 characters.
    ///   - voice: The voice used for the generated audio. Defaults to `.alloy`.
    ///   - speed: The playback speed; values outside 0.25...4.0 are clamped by the
    ///     request body. Pass `nil` to use the default speed (1.0).
    /// - Returns: The generated audio wrapped in an `OpenAI.Speech` object.
    public func createSpeech(
        model: OpenAI.Model,
        text: String,
        voice: OpenAI.APISpecification.RequestBodies.CreateSpeech.Voice = .alloy,
        speed: Double?
    ) async throws -> OpenAI.Speech {
        let requestBody = OpenAI.APISpecification.RequestBodies.CreateSpeech(
            model: model,
            input: text,
            voice: voice,
            speed: speed
        )
        
        let data = try await run(\.createSpeech, with: requestBody)
        
        return OpenAI.Speech(data: data)
    }
    
    /// Generates audio for the given text using a strongly-typed speech model.
    ///
    /// Convenience overload; see `createSpeech(model:text:voice:speed:)` taking
    /// `OpenAI.Model` for parameter documentation.
    public func createSpeech(
        model: OpenAI.Model.Speech,
        text: String,
        voice: OpenAI.APISpecification.RequestBodies.CreateSpeech.Voice = .alloy,
        speed: Double?
    ) async throws -> OpenAI.Speech {
        // Delegate to the general overload so the request construction lives
        // in exactly one place.
        try await createSpeech(
            model: OpenAI.Model.speech(model),
            text: text,
            voice: voice,
            speed: speed
        )
    }
}

extension OpenAI.APIClient {
@discardableResult
public func createRun(
Expand Down
Loading