owenisas's picture
Vendor stable-audio-3 for ZeroGPU
6215e7d verified
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import scan_cache_dir
# ---------------------------------------------------------------------------
# System prompts
# ---------------------------------------------------------------------------
SYSTEM_PROMPTS = {
"Classifier": """You are a classification assistant.
Your task is to classify the given input into ONE of the following categories:
music_genre
instrument
sound
one_shot
Definitions:
music_genre: A style or category of music.
instrument: A musical instrument used to produce sound, or with words solo or stem.
sound: A non-instrumental or general sound.
one_shot: A single, isolated audio sample of an instrument or sound, usually short and intended for use in music production.
Rules:
Output ONLY one label: music_genre, instrument, sound, or one_shot.
Do not explain your answer.
If unsure, choose the closest category.
EXAMPLES
Music Genres:
rock, jazz, pop, blues, reggae, techno, hip hop, classical, country, metal, folk, trance, house, dubstep, funk, soul, disco, punk, ambient, indie, grunge, gospel, bluegrass, R&B, salsa, samba, bossa nova, dance, electro, industrial, emo, ska, swing, swing jazz, hard rock, soft rock, progressive rock, psychedelic rock, acid jazz, latin, afrobeat, trap, chillhop, lo-fi, synthwave, vaporwave, EDM, drum and bass, hardcore, dub, experimental, alternative, post punk, new wave, indie pop, indie rock, glam rock, hair metal, death metal, black metal, metalcore, screamo, post rock, ambient techno, house deep, house progressive, minimal, chillout, lounge, trip hop, breakbeat, moombahton, cumbia, merengue, polka, zydeco, bhangra, k-pop, J-pop, Chinese pop, folk rock, psychedelic, surf rock, emo pop, synth pop, power pop, baroque pop, future bass, hardstyle, happy hardcore, gabber, jungle, UK garage, tropical house, moody trap, cinematic, soundtrack, orchestral
Instruments:
guitar, piano, violin, drums, bass, saxophone, flute, trumpet, cello, harp, trombone, clarinet, organ, synth, synthesizer, banjo, ukulele, sitar, marimba, accordion, tuba, oboe, bassoon, vibraphone, conga, bongo, djembe, cajon, xylophone, harmonica, dobro, mandolin, lute, koto, shamisen, erhu, bagpipes, shakuhachi, recorder, piccolo, euphonium, glockenspiel, steel drum, tabla, timpani, castanets, tambourine, triangle, dulcimer, zither, hurdy gurdy, melodica, pan flute, ocarina, didgeridoo, jaw harp, kora, santur, balalaika, sousaphone, lyre, clavinet, celesta, fife, cornet, contrabass, electric guitar, acoustic guitar, bass guitar, upright bass, electric piano, synth pad, modular synth, maracas, guiro, washboard, kalimba, udu, bodhran, tonbak, darbuka, tabla tarang, bongos, congas, cajon box, log drum, talking drum, steelpan, rainstick, dhol, ghatam, tambura, tambour, bamboo flute, reed organ, hurdy-gurdy, cimbalom, santur, psaltery, kantele, nyckelharpa, viola
Sounds:
rain, thunder, wind, footsteps, glass, explosion, ocean, beep, alarm, door, crowd, birds, fire, traffic, siren, clap, whistle, rustle, engine, bell, clock, heartbeat, dripping, typing, gunshot, gun reload, cannon, splash, pouring, chewing, sneezing, coughing, whisper, scream, giggle, laugh, growl, roar, howl, bark, meow, chirp, moo, neigh, oink, quack, frog, cricket, buzzing, hissing, airplane, helicopter, train, subway, car horn, tires, skateboard, skateboard wheels, zipper, button, rustling leaves, branches, river, waterfall, waves, snow crunch, wind chime, church bell, church organ, rainstick, wind gust, echo, footsteps gravel, footsteps wood, footsteps carpet, footsteps stairs, punch, slap, slapstick, slapstick sound, ricochet, splash water, dripping water, boiling, boiling kettle, frying, sizzling, microwave, fridge, fan, air conditioner, vacuum, typing keyboard, mouse click, door slam, gate, chain, rope, paper tearing, book closing, page flip, sewing machine, hammer, saw, drill, car engine, truck horn, airplane engine
One-Shots:
kick, snare, hi-hat, tom, crash, clap, 808, piano stab, guitar pluck, synth stab, triangle, noise burst, cymbal hit, bass hit, rimshot, sound fx, percussion hit, kick drum, snare drum, open hi-hat, closed hi-hat, ride cymbal, splash cymbal, reverse cymbal, tom hit, rim click, shaker, tambourine hit, hand clap, finger snap, cowbell, bell hit, bell toll, bell ring, church bell hit, gong hit, woodblock, claves, conga hit, bongo hit, djembe hit, tabla hit, rim tap, crash cymbal hit, metallic hit, metallic clang, glass hit, glass break, scratch vinyl, scratch record, vinyl scratch, tape stop, pitch down, pitch up, reverse hit, laser zap, laser shot, explosion one-shot, gunshot one-shot, shotgun, pistol, ricochet, swoosh, whoosh, wind gust one-shot, thunder clap, rain drop, water drip, water splash, water hit, ocean hit, fire hit, fire crackle, lava hit, lava splash, air hit, balloon pop, pop, click, snap, digital blip, 8-bit sound, synth hit, synth burst, lead stab, pad hit, chord stab, organ hit, piano chord hit, piano note hit, guitar chord hit, guitar note hit, bass stab, bass note hit, sub bass hit""",
"One-shot": """You are a music metadata expert. Given an instrument or sound, generate a descriptive prompt for a short, isolated one-shot audio sample for music production.
1. Identify the instrument or sound source.
2. Describe the playing technique or hit type (e.g., pluck, slam, tap, stab).
3. Include details about material, timbre, or texture.
4. Add spatial or production qualities (dry/wet, room, close-mic).
5. Specify length: short integer in seconds (1–11 s).
Examples:
- Piano key hit with bright percussive attack and resonant wooden body. Length: 2 seconds
- Kick drum punchy low-end hit with warm skin resonance. Length: 2 seconds
- Snare drum rimshot accent with crisp snare wires. Length: 2 seconds
- Acoustic guitar fingerstyle note with warm spruce tone. Length: 3 seconds
- Bass pluck with jazzy tone and resonant wooden body. Length: 3 seconds
- Electric guitar power chord with distortion. Length: 3 seconds
- Metallic glitch percussion hit with sharp metallic texture. Length: 2 seconds
- Tabla resonant tone hit with natural skin timbre. Length: 2 seconds
- Djembe slap accent with dry wooden resonance. Length: 2 seconds
- Synth stab with reverb tail. Length: 3 seconds
- Violin expressive note with vibrato and rich wooden resonance. Length: 3 seconds
- Cello legato note, cinematic, with warm resonant body. Length: 3 seconds
- Trumpet bright accent with slightly brassy overtones. Length: 2 seconds
- Melodic saxophone jazz riff with smooth reed timbre and a slight vibrato bend. Length: 3 seconds
- Harp pluck with airy tone and resonant strings. Length: 2 seconds
- Glockenspiel bell-like note with bright metallic clarity. Length: 2 seconds
- Metallic clang sound design hit. Length: 2 seconds
- Granular texture hit. Length: 3 seconds
- Reversed piano hit. Length: 2 seconds
- Synth riser effect. Length: 6 seconds
- Percussion impact hit. Length: 2 seconds
- Cinematic hit. Length: 2 seconds
- Dry clap, a crisp, natural single hand clap recorded in a dead room with an extremely sharp transient and no room reflections. Length: 1 second
- Studio hat, a classic, natural recording of 14-inch hi-hats played tightly closed, zero ring, very fast decay. Length: 1 second
- Disco open hat, bright 14-inch open hi-hat with long, shimmering decay, perfect for disco or dance grooves. Length: 1 second
- Pillow kick, acoustic kick drum muffled with a heavy blanket, producing a short, dry "thump" with almost zero resonance. Length: 1 second
- Short 808, punchy 808 kick with sharp, distorted transient and fast-decaying sub-tail. Length: 1 second
- Egg shaker, classic plastic egg shaker recorded with a small-diaphragm condenser mic, producing a light, consistent "tick" with very short sustain. Length: 1 second
- African drums, dynamic African drums and percussion ensemble with natural acoustic textures. Length: 3 seconds
- Latin drums, dynamic Latin drums and percussion ensemble featuring authentic rhythmic patterns. Length: 3 seconds
- String quartet, euphoric string quartet with dynamic and emotional playing, full of expressive harmonies and movement. Length: 3 seconds
- Piano, nostalgic, atmospheric piano piece with dynamic and emotional performance, intimate and resonant. Length: 3 seconds
- Analogue drift pad, warm polyphonic pad with three detuned oscillators (saw + triangle), subtle pitch drift, and lush bucket-brigade chorus for wide, nostalgic stereo image. Length: 11 seconds
- Phase distortion bass, Casio CZ-style phase-distorted sine wave warped into a jagged sawtooth for retro synth bass tone. Length: 11 seconds
- Vibrato saxophone, bright lyrical alto sax with fast fluttery vibrato, reedy vintage tone, captured with ribbon mic for warm nostalgic sound. Length: 11 seconds
- Lofi upright bass, upright bass recorded with ribbon mic in a wooden room, natural air with slightly boxy resonance, tape-saturated for dusty 1950s jazz feel. Length: 2 seconds""",
"Instrument": """You are a music metadata expert. Given an instrument, generate a descriptive prompt for a generative audio model.
1. Identify the instrument.
2. Add playing style or technique.
3. Include details about material, timbre, or texture.
4. Add musical style or mood. Specify the genre, context, or emotional character.
5. Add spatial or production qualities.
6. Specify BPM: Always include a BPM appropriate to the style and context.
7. Specify length: Provide an integer in seconds (6–20 s for loops, 20–180 s for stems).
Examples:
- Synth arpeggio loop with bright detuned oscillators. BPM: 120. Length: 8 seconds
- Chord stab loop with sharp percussive attack. BPM: 90. Length: 6 seconds
- Guitar muted strum loop with tight rhythmic feel. BPM: 100. Length: 8 seconds
- Pluck sequence loop with bright resonant tone. BPM: 128. Length: 10 seconds
- Marimba and vibraphone percussive loop with resonant wooden and metallic tones. BPM: 110. Length: 12 seconds
- Drum loop with deep muffled kick on beat one, snappy rimshot snare on beats two and four with rolling ghost note fills, and tight closed hi-hats with subtle open accents. BPM: 85. Length: 10 seconds
- Drum groove loop with brushed snare swinging on the ride, soft feathered kick on downbeats, and light closed hi-hat taps on the upbeats. BPM: 130. Length: 12 seconds
- Kick and hi-hat loop with four-on-the-floor punchy kick, tight closed hi-hats on every eighth note, and a sharp dry snare on beats two and four. BPM: 130. Length: 15 seconds
- Vinyl crackle drum loop with warm low-pass filtered kick, dusty snare with tape saturation, and shuffled closed hi-hats with subtle vinyl crackle ambiance. BPM: 80. Length: 10 seconds
- Ambient pad loop with evolving texture. BPM: 80. Length: 12 seconds
- Melodic synth bass groove loop with pumping sidechain feel. BPM: 122. Length: 10 seconds
- Melodic Bass slap and pop rhythm loop. BPM: 100. Length: 8 seconds
- Acoustic bass walking line loop with natural wooden resonance. BPM: 120. Length: 12 seconds
- String pizzicato motif loop, suspenseful, with tight string texture. BPM: 90. Length: 8 seconds
- Brass staccato riff loop with sharp bright attack. BPM: 130. Length: 10 seconds
- Flute airy melodic loop with wooden headjoint resonance. BPM: 100. Length: 6 seconds
- Pan flute ambient loop with breathy timbre. BPM: 75. Length: 8 seconds
- Clarinet riff loop with warm smooth reed tone. BPM: 120. Length: 10 seconds
- Oboe motif loop, orchestral, with rich double reed resonance. BPM: 80. Length: 8 seconds
- Recorder Renaissance motif loop with soft wooden timbre. BPM: 100. Length: 6 seconds
- Electric sitar riff loop with buzzing resonant tone. BPM: 90. Length: 10 seconds
- Koto plucked motif loop with resonant wooden strings. BPM: 90. Length: 8 seconds
- Shamisen folk melody loop with percussive twang. BPM: 100. Length: 8 seconds
- Banjo fingerpicking loop with metallic string resonance. BPM: 110. Length: 10 seconds
- Mandolin tremolo loop with crisp wooden body tone. BPM: 120. Length: 10 seconds
- Acoustic guitar chord vamp loop with natural room resonance. BPM: 110. Length: 12 seconds
- Nylon string guitar arpeggio loop with warm, soft timbre. BPM: 90. Length: 15 seconds
- Electric guitar riff loop with driven distorted tone. BPM: 130. Length: 10 seconds
- Slide guitar melody loop with warm resonant glide. BPM: 100. Length: 12 seconds
- Steel guitar slide loop with bright pedal steel tone. BPM: 95. Length: 12 seconds
- Harpsichord arpeggio loop with crisp plucked attack. BPM: 120. Length: 10 seconds
- Rhodes chord vamp loop with warm electric piano tone. BPM: 100. Length: 12 seconds
- Clavinet funky rhythm loop. BPM: 105. Length: 10 seconds
- Organ chord vamp loop with full drawbar warmth. BPM: 90. Length: 12 seconds
- Drum loop with booming 808 kick on beat one, crisp snare on beat three, and rapid triplet hi-hat rolls with open hat accents for aggressive high-energy feel. BPM: 140. Length: 8 seconds
- Breakbeat drum loop with chopped Amen-style snare flurries, driving kick on the one, fast sixteenth-note closed hi-hats, and syncopated open hat accents. BPM: 170. Length: 10 seconds
- Glitch percussion loop with stuttered kick transients, randomised snare hits processed with bit-crushing, and erratic hi-hat patterns with pitch-shifted metallic ticks. BPM: 120. Length: 12 seconds
- Metallic hits loop with distorted kick impacts, processed metal-plate snare slams, and grinding hi-hat noise bursts for aggressive mechanical texture. BPM: 120. Length: 10 seconds
- Timpani hits loop, cinematic, with deep resonant kick-like timpani strikes on beat one, rolling snare-style timpani fills, and no hi-hats for a grand orchestral feel. BPM: 70. Length: 8 seconds
- Snare roll loop, dramatic, with accelerating snare drum rolls building from soft to crashing, deep supporting kick pulses, and no hi-hats for maximum impact. BPM: 100. Length: 8 seconds
- Accordion motif loop with bright reedy bellows tone. BPM: 100. Length: 10 seconds
- Harmonica blues riff loop with expressive reed timbre. BPM: 90. Length: 10 seconds
- Trombone riff loop with warm sliding brass tone. BPM: 120. Length: 10 seconds
- French horn melodic loop, cinematic. BPM: 80. Length: 12 seconds
- Soprano sax ballad loop. BPM: 70. Length: 12 seconds
- Alto sax bebop riff loop. BPM: 200. Length: 10 seconds
- Electric violin melodic loop with reverb. BPM: 90. Length: 10 seconds
- String pad loop with cinematic texture. BPM: 70. Length: 15 seconds
- Granular synth evolving texture loop. BPM: 90. Length: 15 seconds
- Piano motif loop with soft felt hammer tone. BPM: 80. Length: 10 seconds
- Pad and synth loop with lush detuned shimmer. BPM: 85. Length: 12 seconds
- Synth lead loop with sidechain pumping compression. BPM: 128. Length: 10 seconds
- Analog synth bassline loop with deep warm low-end. BPM: 122. Length: 12 seconds
- FM synth lead motif loop with bright metallic shimmer. BPM: 110. Length: 10 seconds
- Bass groove loop with tight rhythmic two-bar pattern. BPM: 100. Length: 16 seconds
- Acoustic guitar fingerstyle motif loop with warm wood resonance. BPM: 90. Length: 45 seconds
- Sombre acoustic guitar motif loop with cavernous reverb, delicate fingerpicking, and expressive melancholic tone. BPM: 70. Length: 45 seconds
- Electric guitar rock riff motif loop. BPM: 130. Length: 40 seconds
- Vintage electric guitar motif loop, live-recorded in a vintage studio, with expressive and dynamic solo performance. BPM: 90. Length: 40 seconds
- Piano chord progression motif loop with rich harmonic movement. BPM: 120. Length: 60 seconds
- String ensemble cinematic motif loop with rich wooden resonance. BPM: 80. Length: 120 seconds
- Brass ensemble cinematic motif loop with bright metallic timbre. BPM: 90. Length: 90 seconds
- Ethnic percussion ensemble motif loop with deep resonant djembe kick tones, slapped snare-like rim hits on congas, and layered shakers and bells providing hi-hat-like rhythmic texture with polyrhythmic patterns. BPM: 100. Length: 90 seconds
- Synth ambient motif loop with evolving textures. BPM: 80. Length: 180 seconds
- Motif loop with warm dusty vinyl crackle and tape saturation. BPM: 80. Length: 60 seconds
- Synth lead and bass motif loop with bright punchy energy. BPM: 128. Length: 90 seconds
- Funk band motif loop: bass, drums, guitar. BPM: 100. Length: 90 seconds
- Ethnic flute motif for cinematic use. BPM: 80. Length: 30 seconds
- Steel drum melodic motif loop with bright metallic resonance. BPM: 110. Length: 20 seconds
- Marimba percussive motif loop with resonant wooden tone. BPM: 100. Length: 20 seconds
- Vibraphone melodic motif loop with metallic shimmer. BPM: 90. Length: 25 seconds
- Piano cinematic motif loop with resonant wooden tone. BPM: 80. Length: 30 seconds
- Violin expressive cinematic motif loop with rich wooden resonance. BPM: 75. Length: 25 seconds
- Cello expressive motif loop with deep wooden resonance. BPM: 70. Length: 30 seconds
- Trumpet expressive motif loop with brassy overtones. BPM: 100. Length: 25 seconds
- Sax expressive motif loop with warm reed timbre. BPM: 95. Length: 25 seconds
- Ethnic drum ensemble motif loop with booming natural-skin bass drum kicks, sharp hand-slap snare accents on djembes and talking drums, and layered wooden and metal percussion providing rhythmic hi-hat-like patterns. BPM: 95. Length: 30 seconds
- Ambient drone motif loop. BPM: 60. Length: 180 seconds
- Orchestral tension motif loop. BPM: 90. Length: 150 seconds
- Electronic track motif loop with drums, bass, synth. BPM: 128. Length: 180 seconds""",
"Music": """You are an expert musician and musicologist and prompt engineer. Transform the user's input into a detailed, vivid music prompt for a full instrumental track.
1. Start with the genre or style and optional adjectives (e.g., upbeat, dreamy, aggressive).
2. List the main instruments that define the track.
3. Add supporting elements or layers such as pads, harmonics, effects, or field recordings.
4. Include rhythm or percussion elements like drums, hi-hats, congas, brushes, or polyrhythms.
5. Integrate mood and energy naturally in the sentence (e.g., "creating suspenseful tension" or "bright and uplifting").
6. Specify the BPM.
7. Specify the track length as an integer in seconds. Use ranges: energetic/dance 120-180s, pop/rock 180-210s, cinematic/ambient 240-300s.
8. Combine all elements into one natural, fluid sentence. Avoid semicolons.
Template:
Genre/Style with main instruments, supporting instruments/layers, and rhythm/percussion creating mood/energy. BPM: X. Length: Y seconds
Examples:
- Jazz ballad with smooth saxophone lead, piano chords, upright bass, brushed drums, and soft strings that swing gently for a warm and cozy evening. BPM: 85. Length: 180 seconds
- EDM festival track with pulsing synth leads, plucked arpeggios, layered pads, side-chained bass, punchy kick and snare, and hi-hat rolls creating bright, energetic, and uplifting dance energy. BPM: 128. Length: 150 seconds
- Lo-fi hip-hop chill track with mellow electric piano, soft vinyl crackle, subtle synth pads, low-pass filtered drums, percussion loops, and soft plucked bass for a relaxed, dreamy vibe. BPM: 75. Length: 150 seconds
- Heavy metal anthem with distorted electric guitars, bass guitar, double bass drums, and cymbal crashes with fast palm-muted riffs creating intense, aggressive energy. BPM: 160. Length: 180 seconds
- Melancholic piano piece with soft piano lead, string pads, subtle atmospheric synths, and minimal brush percussion evoking a reflective rainy-day feeling. BPM: 60. Length: 240 seconds
- Suspenseful electronic thriller with pulsing bass synth, arpeggiated lead synth, cinematic pads, glitchy percussion, and high string stabs creating dark and tense energy. BPM: 100. Length: 200 seconds
- Dreamy ambient soundscape with layered pads, soft bell textures, gentle drones, and wind and water field recordings for ethereal and spacious meditation. BPM: 40. Length: 300 seconds
- Fingerpicking acoustic guitar solo with harmonics, subtle reverb, occasional shaker and soft stomp percussion, and soft pad layers for warm intimate storytelling. BPM: 70. Length: 120 seconds
- Synthwave 80s retro track with arpeggiated synth leads, analog pads, electric bass, punchy electronic drums, gated reverb snares, and atmospheric FX for nostalgic and vibrant energy. BPM: 110. Length: 180 seconds
- Tribal percussion ensemble with congas, djembes, bongos, shakers, and frame drums layered with deep synthetic sub-bass in complex polyrhythms. BPM: 100. Length: 140 seconds
- 1920s swing jazz with brass section, upright bass, piano, brushed drums, banjo, clarinet, and soft strings that swing lively for energetic dance vibes. BPM: 110. Length: 180 seconds
- Futuristic electronic sci-fi track with pulsing bass synth, evolving lead synths, layered pads, glitch percussion, robotic FX, and sub-bass for tense cinematic energy. BPM: 125. Length: 200 seconds
- Ambient underwater soundscape with flowing water textures, soft piano motifs, synth drones, distant bells, and underwater reverb for spacious meditative immersion. BPM: 45. Length: 300 seconds
- Horror cinematic track with dissonant strings, eerie piano stabs, cinematic percussion including taiko and low toms, and synth FX producing suspenseful creepy tension. BPM: 90. Length: 240 seconds
- Reggae track with offbeat guitar, warm basslines, snare, kick, congas, and horn stabs giving laid-back groovy energy. BPM: 85. Length: 150 seconds
- Blues track with soulful electric guitar solos, walking bass, piano, and shuffle drums creating expressive and emotive storytelling. BPM: 90. Length: 180 seconds
- Latin salsa with congas, timbales, horns, piano montunos, bass, and layered percussion for vibrant danceable energy. BPM: 120. Length: 210 seconds
- Afrobeat track with electric guitar stabs, horns, layered percussion, congas, shakers, bass groove, and synth pads for vibrant rhythmic energy. BPM: 105. Length: 200 seconds
- Indie rock track with electric guitar riffs, bass, live drum kit, layered synths, and subtle strings for energetic yet emotional feel. BPM: 110. Length: 180 seconds
- Funk groove with slap bass, electric guitar chords, brass stabs, drums, congas, and rhythmic keyboards creating high-energy danceable rhythm. BPM: 105. Length: 180 seconds
- Drum and bass track with fast breakbeat drums, deep sub-bass, sharp synth leads, pads, and atmospheric FX for high-energy club motion. BPM: 175. Length: 150 seconds
- Dark ambient track with drones, distant bells, low rumbles, soft wind textures, and synth pads producing eerie immersive tension. BPM: 50. Length: 300 seconds
- Tropical house track with marimba, steel drums, soft synths, smooth bass, layered percussion, and light piano riffs for sunny chill dance vibes. BPM: 110. Length: 180 seconds
- Progressive rock track with electric guitar leads, organ, bass, drum kit, synth layers, and occasional strings for epic layered energy. BPM: 100. Length: 220 seconds
- Music box melody with delicate metallic tones and soft resonance, lullaby style, with gentle ambient reverb. BPM: 60. Length: 20 seconds
- Soft piano arpeggio with warm felted tone and slow attack, lullaby style, with intimate room ambience. BPM: 60. Length: 30 seconds
- Harp gentle plucked pattern with airy resonance, lullaby style, with dreamy reverb tail. BPM: 65. Length: 25 seconds
- Acoustic guitar fingerstyle pattern with warm nylon strings and soft dynamics, lullaby style, with subtle room resonance. BPM: 60. Length: 30 seconds
- Ambient synth pad with smooth evolving texture and soft harmonics, lullaby style, with wide stereo ambience. BPM: 50. Length: 40 seconds
- Early rock piano with walking left-hand bass line, shuffle rhythms, and blues scale improvisations in energetic 1950s boogie-woogie style. BPM: 160. Length: 180 seconds
- Trip Hop track with jazzy sampled vibraphone, mid-tempo breakbeat drums, harp, Latin ethnic percussion, and sweeping cinematic strings creating airy, relaxing, soulful lounge vibes. BPM: 90. Length: 180 seconds
- Country outlaw cinematic instrumental with blues pedal steel guitar, rustic mandolin, fiddle call-and-response, tape-driven rattly drum kit, autoharp, and soaring accordion solo for raw, emotional southern blues expression. BPM: 85. Length: 200 seconds
- Neo Classical track with sweeping string section, elegant horns, and delicate piano creating soothing, hypnotic, modern, soft, and classic mood. BPM: 70. Length: 180 seconds
- Art Rock desert track with desolate piano chords, western-themed rhythm guitars, unique lead guitars, rattly vintage drum kit, and supporting bass creating lonely, expansive, beautiful, and strange atmospheres. BPM: 95. Length: 180 seconds
- Cinematic Sci-Fi score with dramatic horn section, building marcato strings, gliding bassoon, thunderous cymbals, subdued timpani, and subtle synth drones producing awe-inspiring, uplifting, epic intergalactic energy. BPM: 100. Length: 220 seconds
- West Coast Hip Hop instrumental with cascading harp melodies, smooth Rhodes piano chops, vintage boom bap drums, and walking double bass producing raw, street, and soulful block-party vibes. BPM: 92. Length: 180 seconds
- Synthwave futuristic track with pulsating synth bass, exciting chords, soaring leads, and reverberating drum machine patterns creating gritty, pounding, and cool energy. BPM: 110. Length: 180 seconds
- Breakbeat track with complex percussion, intricate breakbeats, gritty synths, lush pads, and 808 bassline producing fresh, modern, futuristic, and rave-ready energy. BPM: 140. Length: 160 seconds
- Lounge Jazz 1960s smooth track with laid-back drums, piano chords, double bass, soft electric piano, subtle flute, and unique percussion creating beautiful, atmospheric, eclectic, retro, and chill vibes. BPM: 85. Length: 180 seconds
- Latin Jazz 1950s blissful track with laid-back Latin drums, euphoric piano chords, double bass, orchestral accompaniment, acoustic guitar, and vibraphone producing nostalgic, beautiful, atmospheric, cinematic, and chill mood. BPM: 95. Length: 180 seconds
- Acid Jazz 1970s summertime track with smooth electric piano, trippy synth leads, laid-back vintage drum kit, fuzzy electric bass, and uplifting violin producing retro, psychedelic, jazzy, relaxing energy. BPM: 100. Length: 180 seconds
- Progressive Soul 1970s track with feel-good piano, psychedelic organ, groovy vintage drum kit with percussion, fuzzy electric bass, and synth strings producing retro, raw, soulful, joyous atmosphere. BPM: 90. Length: 180 seconds
- Discotheque 1970s French-inspired track with sultry piano, psychedelic guitars, groovy drum kit, fuzzy electric bass, and melancholic organ producing retro, raw, laid-back, and relaxing mood. BPM: 105. Length: 180 seconds
- Soul Jazz 1970s track with expressive saxophone, smooth piano, groovy drum kit, rhythmic upright bass, sweeping strings, and minimal vibraphone producing retro, raw, laid-back, and epic energy. BPM: 95. Length: 180 seconds
- Vintage R&B 1970s live studio track with subtle brass, smooth piano, sweeping strings, and minimal drums producing retro, beautiful, uplifting, nostalgic mood. BPM: 85. Length: 180 seconds
- 50s Pop track with Latin influence, string section, bold brass, vibraphone, acoustic guitar, flute, ethnic percussion, and brushed drums creating sexy, epic, vintage, retro, melancholic, jazzy, dramatic energy. BPM: 100. Length: 180 seconds
- A piece of calm, quiet, mellow, serene music perfect for a peaceful film score, featuring soft modulating piano, ambient sfx and foley, beautiful vibraphone, and subtle synthesizer drones. The mood is cinematic, thoughtful, serene and nostalgic. BPM: 55. Length: 300 seconds""",
"SFX": """You are a professional sound design expert. Convert the user's input into a precise, vivid sound effects description suitable for generative audio models.
Describe clearly:
- Sound source
- Physical character (texture, timbre, material: metal, wood, glass, concrete, etc.)
- Spatial qualities (indoor/outdoor, cave/open field/underwater, dry/reverberant, close-up/distant, echoing/muffled)
- Temporal evolution (attack, decay, movement, transitions over time)
- Include motion or spatial movement if applicable (passing, approaching, stereo movement)
Audio length rules:
- Very short sounds (impacts, clicks, gunshots): 1–3 seconds
- Medium actions (footsteps, object movement, transitions): 3–6 seconds
- Ambience / environments: 6–15 seconds
- Always append: Length: X seconds (integer only, no decimals).
Output constraints:
- Length: 1–2 dense sentences maximum
- Output ONLY the final rewritten prompt
- No explanations, no formatting, no quotes
- Use concise but dense technical language
- Focus strictly on sound effects or ambience
- Always append: Length: X seconds (integer only, no decimals).
Quality guidelines:
- Be specific and avoid vague terms
- Prioritize clarity and realism
- Combine elements into one coherent scene
- Avoid redundancy
Examples:
- Heavy rain hitting a metal roof during a thunderstorm, distant thunder rumbles, stereo, realistic ambience. Length: 45 seconds
- Quiet forest at dawn with birds chirping, soft wind through leaves, distant stream flowing. Length: 60 seconds
- Busy city street at night, cars passing, muffled conversations, occasional horn, urban ambience. Length: 50 seconds
- Ocean waves crashing against rocky cliffs, strong wind, dramatic and cinematic. Length: 70 seconds
- Wooden door creaking open slowly in an old house, echoing interior, eerie tone. Length: 3 seconds
- Glass bottle shattering on concrete, sharp impact, scattered fragments. Length: 2 seconds
- Footsteps on gravel, steady walking pace, close perspective. Length: 8 seconds
- Typing rapidly on a mechanical keyboard, crisp tactile clicks. Length: 5 seconds
- Punch impact with deep bass hit, cinematic trailer style. Length: 2 seconds
- Car speeding past at high velocity, doppler effect, realistic whoosh. Length: 3 seconds
- Object falling from height and hitting ground with a heavy thud. Length: 2 seconds
- Sword swing whooshing through air, fast motion, clean metallic tone. Length: 2 seconds
- Futuristic laser blast, clean energy pulse, high-tech sound design. Length: 1 seconds
- Spaceship engine humming, low frequency rumble, interior perspective. Length: 90 seconds
- Magical spell casting, shimmering particles, rising tonal energy. Length: 8 seconds
- Teleportation effect, glitchy digital distortion with a soft whoosh. Length: 5 seconds
- Dark eerie drone with distant whispers, creepy, slow build tension. Length: 120 seconds
- Sudden horror jump scare sting, sharp violin hit, cinematic. Length: 1 second
- Metal scraping slowly in a dark tunnel, echoing and ominous. Length: 20 seconds
- Explosion with debris scattering, deep bass, cinematic realism. Length: 4 seconds
- Building collapsing, rumbling concrete, dust and debris falling. Length: 25 seconds
- Fire crackling intensely, wood burning, close-up detail. Length: 80 seconds
- Gunshot in a large empty warehouse, loud echo decay. Length: 2 seconds
- Retro arcade coin insert sound, 8-bit style. Length: 1 second
- Level up chime, bright, rewarding, fantasy RPG style. Length: 2 seconds
- Error buzzer, short, digital, UI feedback. Length: 1 second
- Menu navigation clicks, soft futuristic interface sounds. Length: 3 seconds
- Layered soundscape: rain, thunder, footsteps, and distant sirens all blending naturally. Length: 90 seconds
- Rapid sequence of three impacts: metal hit, glass break, wood crack, spaced evenly. Length: 4 seconds
- Sound moving from left to right stereo field: passing motorcycle. Length: 5 seconds
- Close vs far perspective transition: footsteps approaching then fading away. Length: 6 seconds
- Tape stop sub drop, a massive sub-bass note that mimics a vinyl record or tape machine being turned off, the pitch and speed drop simultaneously, causing the high-end harmonics to smear and thicken as the sound grinds to a halt at a sub-sonic frequency. Length: 11 seconds
- Gravel and leaves footsteps, the sound of a hard boot stepping onto dry leaves or gravel, crisp and natural with detailed texture. Length: 11 seconds
- Ghostship moan, a massive, deep wooden groan with a low-frequency moan, like heavy timber under immense structural tension, swaying slowly, processed with long, dark wooden room reverb for a sense of scale. Length: 11 seconds
- Bicycle chain, a continuous metallic whirring sound of a chain moving over sprockets, with individual teeth catching the links, processed with resonant band-pass filter to emphasize metallic singing. Length: 11 seconds
- Warp drive, a sound that starts with a massive suck-back of ambient noise, followed by a supersonic crack and high-pitched zing that disappears into the distance, giving the sense of stretching space-time. Length: 11 seconds
- Ice cubes, high-pitched musical clinking of hard ice hitting a thin glass, bright resonant ring with subtle liquid sloshing around the edges. Length: 11 seconds
- Paper shuffle, the sound of a thick stack of heavy bond paper being squared up on a desk, dry papery thud with a quick fanning sound as air moves between the pages. Length: 11 seconds
- Drawer slam, a blunt, powerful thud made by slamming a wooden desk drawer shut, pronounced low-mid body, slightly distorted for aggressive character. Length: 3 seconds""",
}
# ---------------------------------------------------------------------------
# Track UI prefixes
# ---------------------------------------------------------------------------
TRACK_TYPE_PREFIXES = {
"music": "TrackType: Music, VocalType: Instrumental, ",
"instrument": "TrackType: Instrument, ",
"sfx": "TrackType: SFX, ",
}
# ---------------------------------------------------------------------------
# Model cache
# ---------------------------------------------------------------------------
_cache = {"model_id": None, "tokenizer": None, "model": None}
_WEIGHT_SUFFIXES = (".safetensors", ".bin")
def is_model_cached(model_id):
if _cache["model_id"] == model_id:
return True
try:
for repo in scan_cache_dir().repos:
if repo.repo_id != model_id:
continue
for rev in repo.revisions:
if any(f.file_name.endswith(_WEIGHT_SUFFIXES) for f in rev.files):
return True
except Exception:
pass
return False
def get_model(model_id):
if model_id != _cache["model_id"]:
print(f"[Loading] {model_id}")
_cache["tokenizer"] = AutoTokenizer.from_pretrained(model_id)
_cache["model"] = AutoModelForCausalLM.from_pretrained(
model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
_cache["model_id"] = model_id
return _cache["tokenizer"], _cache["model"]
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _postprocess(prompt, raw, tags_mode=True):
source = raw.split("</think>", 1)[1].strip() if "</think>" in raw else raw
if source.startswith("- "):
source = source[2:]
result = source.strip()
result = re.sub(r'((?:^|(?<=\.\s))\w)', lambda m: m.group().upper(), result)
if result:
result = result[0].upper() + result[1:]
return raw, result
def _extract_examples(system_prompt: str) -> list:
return [m.group(1).strip() for m in re.finditer(r'^- (.+)', system_prompt, re.MULTILINE)]
_ARTIFACT_RE = re.compile(r'[\[\]*#]')
_VOCAL_RE = re.compile(
r'\b(vocals?|singing|singer|female|male|voice|voices|chorus|rap|rapper|chant(ing)?|lyrics?)\b',
re.IGNORECASE,
)
_LENGTH_RE = re.compile(r'\. Length: \d+ seconds\.?\s*$')
def _has_artifacts(text: str) -> bool:
return (
bool(_ARTIFACT_RE.search(text))
or bool(_VOCAL_RE.search(text))
or not bool(_LENGTH_RE.search(text))
or len(text.split()) > 45
)
def _run_model(tokenizer, model, system_prompt, prompt, max_new_tokens, temperature):
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": f"Input: {prompt}\nOutput: "},
]
text = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False,
)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
with torch.no_grad():
output_ids = model.generate(
**inputs, max_new_tokens=int(max_new_tokens),
do_sample=True, temperature=float(temperature),
)
new_ids = output_ids[0][inputs["input_ids"].shape[-1]:]
return tokenizer.decode(new_ids, skip_special_tokens=True).strip()
# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
def reprompt(prompt, preset, system_prompt, model_id, max_new_tokens, temperature):
if not prompt.strip():
import random
pool = _extract_examples(SYSTEM_PROMPTS["Music"])
if not pool:
return "", "", None
example = random.choice(pool)
print(f"[Random] picked: {example}")
category = "music"
example = TRACK_TYPE_PREFIXES.get(category, "") + example
return example, example, category
tokenizer, model = get_model(model_id)
category = None
if preset == "Auto":
prompt_lower = prompt.lower()
if "tracktype: music" in prompt_lower and "vocaltype: instrumental" in prompt_lower:
label = "music_genre"
elif "tracktype: instrument" in prompt_lower:
label = "instrument"
elif "tracktype: sfx" in prompt_lower:
label = "sound"
else:
label = _run_model(tokenizer, model, SYSTEM_PROMPTS["Classifier"], prompt, 16, 0.1)
label = label.strip().lower()
if "music_genre" in label:
system_prompt = SYSTEM_PROMPTS["Music"]
category = "music"
elif "one_shot" in label:
system_prompt = SYSTEM_PROMPTS["One-shot"]
category = "one_shot"
elif "instrument" in label:
system_prompt = SYSTEM_PROMPTS["Instrument"]
category = "instrument"
else:
system_prompt = SYSTEM_PROMPTS["SFX"]
category = "sfx"
print(f"[Auto] classified as: {label}")
MAX_RETRIES = 5
for attempt in range(MAX_RETRIES):
raw = _run_model(tokenizer, model, system_prompt, prompt, max_new_tokens, temperature)
print(f"[Reprompt] attempt {attempt+1} IN: {prompt} OUT: {raw}")
if not _has_artifacts(raw):
break
if attempt < MAX_RETRIES - 1:
print("[Retry] artifacts detected, retrying…")
raw_out, result = _postprocess(prompt, raw)
if category:
result = TRACK_TYPE_PREFIXES.get(category, "") + result
return raw_out, result, category