main
Some checks failed
Backend Deploy 🚀 / build-and-deploy (push) Has been cancelled

This commit is contained in:
Harun CAN
2026-03-29 12:43:49 +03:00
parent 829413f05d
commit 85c35c73e8
41 changed files with 6127 additions and 36 deletions

View File

@@ -0,0 +1,549 @@
import { Injectable, Logger, InternalServerErrorException } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import { GoogleGenAI } from '@google/genai';
/**
 * Caller-supplied parameters describing the video script to generate.
 */
export interface ScriptGenerationInput {
  /** Subject of the video. */
  topic: string;

  /** Requested length of the finished video, in seconds. */
  targetDurationSeconds: number;

  /** Language code for narration and subtitles (for example `tr` or `en`). */
  language: string;

  /** Free-form label describing the desired visual style. */
  videoStyle: string;

  /** Optional URL of a video or other content to use as style inspiration. */
  referenceUrl?: string;

  /** Optional keywords the script should incorporate. */
  seoKeywords?: string[];

  /** X/Twitter-sourced content: the tweet the video is based on. */
  sourceTweet?: {
    /** Handle of the tweet author, without the leading `@`. */
    authorUsername: string;
    /** Raw tweet text. */
    text: string;
    /** Media attached to the tweet. */
    media: { type: string; url: string; width: number; height: number }[];
    /** Engagement counters of the tweet. */
    metrics: { replies: number; retweets: number; likes: number; views: number };
    /** Whether the tweet is part of a thread. */
    isThread: boolean;
  };
}
/**
 * A single scene of a generated script.
 */
export interface GeneratedScene {
  /** Position of the scene within the script. */
  order: number;

  /** Optional short scene title. */
  title?: string;

  /** Text to be spoken for this scene. */
  narrationText: string;

  /** Description of the visuals to generate for this scene. */
  visualPrompt: string;

  /** On-screen subtitle text for this scene. */
  subtitleText: string;

  /** Length of the scene, in seconds. */
  durationSeconds: number;

  /** Name of the transition to apply (for example `CUT` or `FADE`). */
  transitionType: string;

  /** Optional voice identifier for this scene. */
  voiceId?: string;

  /** AudioGen: per-scene sound effect description. */
  ambientSoundPrompt?: string;
}
/**
 * Search-engine metadata accompanying a generated script.
 */
export interface SeoMetadata {
  /** SEO-oriented title. */
  title: string;

  /** Meta description. */
  description: string;

  /** Keywords related to the topic. */
  keywords: string[];

  /** Hashtags for the video. */
  hashtags: string[];

  /** Structured-data object (for example a schema.org `VideoObject`). */
  schemaMarkup: Record<string, unknown>;
}
/**
 * Complete script returned by the generator: descriptive metadata, SEO data,
 * the ordered scenes, audio directions and per-platform social texts.
 */
export interface GeneratedScript {
  /** Descriptive information about the video as a whole. */
  metadata: {
    title: string;
    description: string;
    /** Total length of the video, in seconds. */
    totalDurationSeconds: number;
    language: string;
    hashtags: string[];
  };

  /** Search-engine metadata. */
  seo: SeoMetadata;

  /** Scenes that make up the video. */
  scenes: GeneratedScene[];

  /** Text description used to generate the background music. */
  musicPrompt: string;

  /** AudioCraft: genre/mood description. */
  musicStyle: string;

  /** AudioCraft: technical parameters. */
  musicTechnical: {
    bpm: number;
    key?: string;
    instruments: string[];
    emotionalArc: string;
  };

  /** AudioGen: project-wide ambient sounds. */
  ambientSoundPrompts: string[];

  /** Description of the narration voice. */
  voiceStyle: string;

  /** Platform-specific titles and captions. */
  socialContent: {
    youtubeTitle: string;
    youtubeDescription: string;
    tiktokCaption: string;
    instagramCaption: string;
    twitterText: string;
  };
}
// ═══════════════════════════════════════════════════════════════════════
// SYSTEM PROMPT — Skill-Enriched (16 skill integrations)
// ═══════════════════════════════════════════════════════════════════════
// Integrated skills:
// - seo-optimizer: SEO-friendly title, description, keywords, schema markup
// - content-creator: Hook formulas, engagement strategy
// - content-research-writer: Verified information, source-based script
// - humanizer: Avoiding AI writing patterns, human-like language
// - social-content: Platform-specific title/description/caption generation
// - voice-ai-development: Voice style and TTS optimization
// - multimodal-audiocraft: MusicGen/AudioGen music and sound-effect generation
// ═══════════════════════════════════════════════════════════════════════
// NOTE(review): the header says 16 integrations but only the seven above are
// listed here — confirm which number is current.
//
// This constant is passed as the system instruction on every generation
// request. It is a runtime string: every character (including the box-drawing
// separators and bullet glyphs) is sent to the model, so edit with care. The
// JSON shape described in its final section lists the same top-level fields as
// the GeneratedScript interface.
const SYSTEM_PROMPT = `You are an elite YouTube Shorts scriptwriter, cinematic video producer, audio designer, and SEO specialist with 15+ years of experience. You create content that ranks #1 on search, hooks viewers in 1 second, sounds genuinely human, and features professionally crafted audio landscapes.
Your mission: Create a production-ready video script as a JSON object. This script feeds directly into an automated AI video pipeline — every field is consumed by a real system. Be precise.
═══════════════════════════════════
CORE PRINCIPLES
═══════════════════════════════════
HUMAN WRITING (anti-AI detection):
- Write narration like a real person talks — not like a textbook
- Vary sentence length. Short. Then longer ones that breathe
- Use "I," "we," personal pronouns when it fits
- Never use: "delve," "tapestry," "landscape" (abstract), "crucial," "moreover," "furthermore," "testament," "underscore," "foster," "garner," "showcase"
- Never use rule-of-three lists ("X, Y, and Z" pattern) repeatedly
- Never use negative parallelisms ("It's not just X, it's Y")
- Avoid em dashes (—) excessively
- Be specific: "47 days" not "a while," "$3,200" not "significant revenue"
- Have opinions. React to facts, don't just report them
- Acknowledge uncertainty: "I'm not sure how to feel about this" is more human than listing pros/cons neutrally
SEO OPTIMIZATION:
- Video title: Primary keyword within first 3 words, under 60 characters
- Description: 2-3 secondary keywords naturally woven in, 150-200 chars
- Keywords: 8-12 LSI keywords related to the main topic
- Hashtags: 5-8 hashtags, mix of broad (#Shorts) and niche-specific
- Schema markup hint for VideoObject structured data
HOOK MASTERY (first 2 seconds):
Use ONE of these proven hook types:
- Curiosity: "Nobody talks about [insider knowledge]"
- Data shock: "[Specific number] — and that changes everything"
- Story: "Last week, [unexpected thing] happened"
- Contrarian: "[Common belief] is wrong. Here's why"
- Question: "What if you could [desirable outcome]?"
DO NOT start with generic phrases like "In this video..." or "Today we'll discuss..."
CONTENT QUALITY:
- Use real, verifiable data points — cite sources when possible
- Structure: Hook → Problem → Evidence → Insight → CTA
- Every scene must create curiosity for the next one
- End with a thought that sticks — not a generic "like and subscribe"
- Make the viewer feel smarter after watching
═══════════════════════════════════
VISUAL PROMPTS (ALWAYS IN ENGLISH)
═══════════════════════════════════
Each scene's "visualPrompt" MUST be in English for Higgsfield AI. Write as detailed cinematic shot descriptions:
• Camera: close-up, extreme wide, aerial drone, POV, tracking, dolly forward, orbiting, slow tilt up
• Lighting: golden hour, chiaroscuro shadows, neon-lit, backlit silhouettes, warm amber, harsh high-contrast
• Atmosphere: misty, ethereal, vibrant saturated, dark moody, pristine, surreal dreamlike
• Motion: "slow zoom into," "camera glides across," "smooth push-in through," "sweeping pan revealing"
• Include textures, colors, environment, scale references
• NEVER: text, logos, watermarks, recognizable human faces, brand names
• Each prompt: 2-3 DETAILED sentences of rich visual description
═══════════════════════════════════
NARRATION TEXT (IN TARGET LANGUAGE)
═══════════════════════════════════
• Short, punchy sentences — max 15 words each
• Scene 1: powerful hook creating instant curiosity
• Build escalating intrigue through middle scenes
• End with a thought-provoking statement
• Word count: targetDuration × 2.5 words/second
• Conversational, not academic — like explaining to a smart friend
• Use rhetorical questions, surprising facts, emotional language
═══════════════════════════════════
SUBTITLE TEXT (IN TARGET LANGUAGE)
═══════════════════════════════════
• Max 8 words per line (mobile readability)
• 1-2 short lines per scene
• Simplify complex narration into punchy visual text
═══════════════════════════════════
SCENE STRUCTURE
═══════════════════════════════════
• Min 4 scenes, max 10 scenes
• Scene 1 (HOOK): 2-4 seconds — instant attention
• Middle scenes: 5-12 seconds each — build the story
• Final scene (CLOSER): 3-6 seconds — memorable conclusion
• Total duration: within ±5 seconds of targetDuration
TRANSITION TYPES:
• CUT — Quick, impactful. Most scene changes
• FADE — Emotional, reflective. Openings/closings
• DISSOLVE — Smooth time transitions
• ZOOM_IN — Focus on detail
• ZOOM_OUT — Reveal scale/context
═══════════════════════════════════
MUSIC & AUDIO DESIGN (AudioCraft)
═══════════════════════════════════
You are also an expert audio designer using Meta AudioCraft (MusicGen + AudioGen).
"musicPrompt" (for MusicGen text-to-music):
- Write detailed, specific English descriptions for AI music generation
- Include: genre, sub-genre, tempo/BPM, key instruments, mood, energy level
- Specify emotional arc: "starts calm, builds to epic climax, resolves softly"
- Good: "Cinematic orchestral trailer music, 90 BPM, minor key, strings and brass building from pianissimo to fortissimo, ethereal choir in background, Hans Zimmer style tension"
- Bad: "Epic music" or "background music"
- Duration hint is NOT needed (handled by system)
"musicStyle" (short genre tag): e.g. "cinematic-orchestral", "lo-fi-hiphop", "electronic-ambient"
"musicTechnical" (structured params):
- bpm: integer (60-180)
- key: optional, e.g. "C minor", "D major"
- instruments: array of 3-6 main instruments
- emotionalArc: describe energy curve, e.g. "low-to-high-to-fade"
PER-SCENE AMBIENT SOUND (for AudioGen text-to-sound):
Each scene can have an "ambientSoundPrompt" — realistic environmental/foley sounds:
- Describe the soundscape naturally: "rain hitting a window with distant thunder"
- Include texture: "wooden footsteps on creaky floor", "bubbling lava with hissing steam"
- Keep it grounded: AudioGen generates realistic sounds, not music
- Scenes without ambient needs: set to null or omit
"ambientSoundPrompts" (project-level): Array of 2-3 reusable ambient sound descriptions for the entire project.
Audio layers in final video (mixed by FFmpeg):
1. Narration (TTS) — loudest, -3dB
2. Background Music (MusicGen) — soft, -18dB under narration
3. Ambient/SFX (AudioGen per scene) — subtle, -22dB
═══════════════════════════════════
VOICE STYLE
═══════════════════════════════════
Describe ideal TTS voice with precision for ElevenLabs:
- Gender, estimated age range
- Tone: warm, authoritative, excited, calm, mysterious
- Pacing: fast for hooks, measured for data, slow for dramatic reveals
- Effects: slight reverb for epic moments, clean for data
═══════════════════════════════════
SOCIAL MEDIA CONTENT
═══════════════════════════════════
Generate platform-specific text:
- youtubeTitle: Primary keyword first, under 60 chars, curiosity-driven
- youtubeDescription: 500+ chars, include CTA, 2-3 secondary keywords, link placeholder
- tiktokCaption: Under 150 chars, trending format, 3-5 hashtags
- instagramCaption: Under 300 chars, emotional hook, 5 hashtags
- twitterText: Under 280 chars, hot take format, 2 hashtags
═══════════════════════════════════
OUTPUT FORMAT — STRICT JSON ONLY
═══════════════════════════════════
Return ONLY valid JSON. No markdown. No backticks. No explanation.
{
"metadata": {
"title": "string",
"description": "string — max 200 chars",
"totalDurationSeconds": number,
"language": "string — ISO 639-1",
"hashtags": ["string"] — 5-8 hashtags WITHOUT #
},
"seo": {
"title": "string — SEO-optimized title, primary keyword first, under 60 chars",
"description": "string — meta description, 150-200 chars, includes secondary keywords",
"keywords": ["string"] — 8-12 LSI keywords,
"hashtags": ["string"] — same as metadata.hashtags,
"schemaMarkup": {
"@type": "VideoObject",
"name": "string",
"description": "string",
"duration": "string — ISO 8601 format PT##S"
}
},
"scenes": [
{
"order": 1,
"title": "string",
"narrationText": "string — in target language, HUMAN-SOUNDING",
"visualPrompt": "string — in English for Higgsfield AI",
"subtitleText": "string — in target language, max 8 words/line",
"durationSeconds": number,
"transitionType": "CUT" | "FADE" | "DISSOLVE" | "ZOOM_IN" | "ZOOM_OUT",
"ambientSoundPrompt": "string | null — English, for AudioGen, realistic environment sound"
}
],
"musicPrompt": "string — detailed English description for MusicGen (genre, BPM, instruments, mood)",
"musicStyle": "string — short genre tag, e.g. cinematic-orchestral",
"musicTechnical": {
"bpm": number,
"key": "string | null",
"instruments": ["string"],
"emotionalArc": "string"
},
"ambientSoundPrompts": ["string"] — 2-3 project-level ambient sound descriptions for AudioGen,
"voiceStyle": "string — TTS characteristics for ElevenLabs",
"socialContent": {
"youtubeTitle": "string — under 60 chars",
"youtubeDescription": "string — 500+ chars with CTA",
"tiktokCaption": "string — under 150 chars",
"instagramCaption": "string — under 300 chars",
"twitterText": "string — under 280 chars"
}
}`;
/**
 * Generates short-form video scripts by prompting a Gemini model and
 * normalizing the JSON it returns into a {@link GeneratedScript}.
 */
@Injectable()
export class VideoAiService {
  /**
   * English names for the language codes the prompt knows about. A Map is used
   * so that codes such as `constructor` cannot resolve to prototype members.
   */
  private static readonly LANGUAGE_NAMES: ReadonlyMap<string, string> = new Map([
    ['tr', 'Turkish'], ['en', 'English'], ['es', 'Spanish'], ['de', 'German'],
    ['fr', 'French'], ['it', 'Italian'], ['pt', 'Portuguese'], ['ru', 'Russian'],
    ['ja', 'Japanese'], ['ko', 'Korean'], ['zh', 'Chinese (Simplified)'],
    ['ar', 'Arabic'], ['hi', 'Hindi'], ['nl', 'Dutch'], ['sv', 'Swedish'], ['pl', 'Polish'],
  ]);

  /**
   * Words the humanizer pass reports, paired with a pre-compiled whole-word,
   * case-insensitive matcher (compiled once instead of per scene).
   */
  private static readonly AI_WORD_PATTERNS: ReadonlyArray<{ word: string; pattern: RegExp }> = [
    'delve', 'tapestry', 'landscape', 'crucial', 'moreover', 'furthermore',
    'testament', 'underscore', 'foster', 'garner', 'showcase', 'pivotal',
    'groundbreaking', 'vibrant', 'nestled', 'renowned', 'breathtaking',
    'interplay', 'intricacies', 'endeavor', 'exemplifies', 'comprehensive',
  ].map((word) => ({ word, pattern: new RegExp(`\\b${word}\\b`, 'i') }));

  /** Lower-case phrases the humanizer pass reports. */
  private static readonly AI_PHRASES: readonly string[] = [
    'in the realm of', 'it is important to note', 'in today\'s world',
    'serves as a testament', 'stands as a', 'it\'s not just',
    'at the end of the day', 'the fact of the matter',
  ];

  private readonly logger = new Logger(VideoAiService.name);
  private readonly genAI: GoogleGenAI;
  private readonly modelName: string;

  /**
   * Reads the API key and model name from configuration and creates the
   * client. A missing key only produces a warning here.
   */
  constructor(private readonly configService: ConfigService) {
    const apiKey = this.configService.get<string>('gemini.apiKey', '');
    this.modelName = this.configService.get<string>('gemini.model', 'gemini-2.5-flash');
    if (!apiKey) {
      this.logger.warn('⚠️ GOOGLE_API_KEY ayarlanmamış — AI servisi devre dışı');
    }
    this.genAI = new GoogleGenAI({ apiKey });
  }

  /**
   * Requests a script from the model, validates it and fills in defaults.
   *
   * @param input Topic, duration, language and optional source material.
   * @returns The validated script.
   * @throws InternalServerErrorException when the model call fails, returns an
   *   empty body, or returns JSON that does not have the expected structure.
   */
  async generateVideoScript(input: ScriptGenerationInput): Promise<GeneratedScript> {
    this.logger.log(
      `Senaryo üretimi başladı — Konu: "${input.topic}", ` +
        `Süre: ${input.targetDurationSeconds}s, Dil: ${input.language}`,
    );

    const userPrompt = this.buildUserPrompt(input);

    try {
      const response = await this.genAI.models.generateContent({
        model: this.modelName,
        contents: userPrompt,
        config: {
          systemInstruction: SYSTEM_PROMPT,
          temperature: 0.85,
          topP: 0.95,
          topK: 40,
          maxOutputTokens: 8192,
          responseMimeType: 'application/json',
        },
      });

      const rawText = response.text ?? '';
      if (!rawText.trim()) {
        throw new InternalServerErrorException(
          'Gemini API boş yanıt döndü. Lütfen tekrar deneyin.',
        );
      }

      const script = this.parseAndValidateScript(rawText);
      const humanizedScript = this.applyHumanizerPass(script);

      this.logger.log(
        `✅ Senaryo üretildi — "${humanizedScript.metadata.title}", ` +
          `${humanizedScript.scenes.length} sahne, ${humanizedScript.metadata.totalDurationSeconds}s, ` +
          `SEO keywords: ${humanizedScript.seo?.keywords?.length || 0}`,
      );
      return humanizedScript;
    } catch (error: unknown) {
      // Errors raised by our own validation already carry a user-facing message.
      if (error instanceof InternalServerErrorException) throw error;

      const reason = error instanceof Error ? error.message : undefined;
      this.logger.error(`Gemini API hatası: ${reason ?? 'Bilinmeyen'}`);
      throw new InternalServerErrorException(
        `Senaryo üretimi başarısız: ${reason ?? 'API hatası'}`,
      );
    }
  }

  /**
   * Builds the user message sent alongside the system prompt.
   *
   * NOTE(review): `topic` and the tweet fields are interpolated verbatim, so
   * they can carry instructions to the model; callers should treat them as
   * untrusted input.
   */
  private buildUserPrompt(input: ScriptGenerationInput): string {
    const languageName =
      VideoAiService.LANGUAGE_NAMES.get(input.language) || input.language;

    let prompt =
      `Create a YouTube Shorts video script about: "${input.topic}"\n\n` +
      `Requirements:\n` +
      `- Target duration: ${input.targetDurationSeconds} seconds\n` +
      `- Narration and subtitle language: ${languageName} (${input.language})\n` +
      `- Visual prompts: ALWAYS in English (for Higgsfield AI)\n` +
      `- Video style: ${input.videoStyle}\n` +
      `- Make it viral-worthy, visually stunning, and intellectually captivating\n` +
      `- The first 2 seconds must hook the viewer immediately\n` +
      `- Write narration that sounds HUMAN — avoid AI writing patterns\n` +
      `- Include SEO-optimized metadata with keywords and schema markup\n` +
      `- Generate social media captions for YouTube, TikTok, Instagram, Twitter\n`;

    if (input.seoKeywords?.length) {
      prompt += `\nTarget SEO keywords to incorporate naturally: ${input.seoKeywords.join(', ')}\n`;
    }

    if (input.referenceUrl) {
      prompt += `\nReference video/content for style inspiration: ${input.referenceUrl}\n`;
    }

    // X/Twitter-sourced content: the tweet data is appended to the prompt.
    if (input.sourceTweet) {
      const tw = input.sourceTweet;
      // Tolerate a missing media array from upstream scrapers.
      const photos = (tw.media ?? []).filter((m) => m.type === 'photo');

      prompt += `\n═══ X/TWITTER SOURCE CONTENT ═══\n`;
      prompt += `This video is based on a viral X/Twitter post by @${tw.authorUsername}.\n`;
      prompt += `Tweet engagement: ${tw.metrics.likes} likes, ${tw.metrics.retweets} retweets, ${tw.metrics.views} views.\n`;
      prompt += `Is thread: ${tw.isThread ? 'YES' : 'NO'}\n`;
      prompt += `\nOriginal tweet text:\n"${tw.text}"\n`;

      if (photos.length > 0) {
        prompt += `\nThe tweet has ${photos.length} photo(s). Use these as VISUAL REFERENCES in your visual prompts.\n`;
        prompt += `Also generate AI-enhanced visuals inspired by these reference images.\n`;
        photos.forEach((p, i) => {
          prompt += `  Reference image ${i + 1}: ${p.url} (${p.width}x${p.height})\n`;
        });
      }

      prompt += `\nIMPORTANT:\n`;
      prompt += `- Analyze WHY this tweet went viral and capture that energy\n`;
      prompt += `- The narration should feel like a reaction/commentary on the tweet content\n`;
      prompt += `- Mention the original tweet author @${tw.authorUsername} naturally in narration\n`;
      // Only refer to the tweet's images when it actually has some.
      if (photos.length > 0) {
        prompt += `- Use both the tweet's images as reference AND generate new AI visuals\n`;
      }
      prompt += `═══════════════════════════════\n`;
    }

    prompt += `\nGenerate the complete script now.`;
    return prompt;
  }

  /**
   * Post-processing pass. Logs (at debug level) any AI-sounding words or
   * phrases found in scene narration; the narration itself is not rewritten.
   * Also fills in `seo` and `socialContent` from `metadata` when the model
   * omitted them.
   *
   * @param script Validated script; mutated in place and returned.
   */
  private applyHumanizerPass(script: GeneratedScript): GeneratedScript {
    for (const scene of script.scenes) {
      const narration = scene.narrationText;
      const lowered = narration.toLowerCase();

      // Check for AI-typical words (case-insensitive, whole word).
      for (const { word, pattern } of VideoAiService.AI_WORD_PATTERNS) {
        if (pattern.test(narration)) {
          this.logger.debug(`Humanizer: "${word}" kelimesi tespit edildi, sahne ${scene.order}`);
        }
      }

      // Check for AI-typical phrases.
      for (const phrase of VideoAiService.AI_PHRASES) {
        if (lowered.includes(phrase)) {
          this.logger.debug(`Humanizer: "${phrase}" kalıbı tespit edildi, sahne ${scene.order}`);
        }
      }
    }

    // Fill in the SEO block when missing.
    if (!script.seo) {
      script.seo = {
        title: script.metadata.title,
        description: script.metadata.description,
        keywords: script.metadata.hashtags || [],
        hashtags: script.metadata.hashtags || [],
        schemaMarkup: {
          '@type': 'VideoObject',
          name: script.metadata.title,
          description: script.metadata.description,
          duration: `PT${script.metadata.totalDurationSeconds}S`,
        },
      };
    }

    // Fill in the social content block when missing.
    if (!script.socialContent) {
      script.socialContent = {
        youtubeTitle: script.metadata.title,
        youtubeDescription: script.metadata.description,
        tiktokCaption: script.metadata.title,
        instagramCaption: script.metadata.title,
        twitterText: script.metadata.title,
      };
    }

    return script;
  }

  /**
   * Parses the model output and enforces the minimum structure the rest of
   * the service relies on, defaulting optional parts.
   *
   * @param rawText Raw model response, optionally wrapped in a Markdown fence.
   * @returns The parsed script with defaults applied.
   * @throws InternalServerErrorException when the text is not JSON, is not an
   *   object with `metadata` and a `scenes` array, has fewer than two scenes,
   *   or a scene lacks `narrationText` / `visualPrompt`.
   */
  private parseAndValidateScript(rawText: string): GeneratedScript {
    let parsed: GeneratedScript;
    try {
      parsed = JSON.parse(VideoAiService.stripCodeFences(rawText)) as GeneratedScript;
    } catch {
      this.logger.error(`JSON parse hatası: ${rawText.substring(0, 500)}`);
      throw new InternalServerErrorException(
        'AI yanıtı geçerli JSON formatında değil.',
      );
    }

    // JSON.parse can legally yield null, arrays or primitives; reject those
    // here instead of letting a TypeError escape from a property access.
    if (
      !VideoAiService.isObject(parsed) ||
      !VideoAiService.isObject(parsed.metadata) ||
      !Array.isArray(parsed.scenes)
    ) {
      throw new InternalServerErrorException('AI yanıtı beklenen yapıda değil.');
    }

    if (parsed.scenes.length < 2) {
      throw new InternalServerErrorException('AI en az 2 sahne üretmelidir.');
    }

    for (let index = 0; index < parsed.scenes.length; index++) {
      const scene = parsed.scenes[index];
      if (
        !VideoAiService.isObject(scene) ||
        !VideoAiService.isNonEmptyString(scene.narrationText) ||
        !VideoAiService.isNonEmptyString(scene.visualPrompt)
      ) {
        throw new InternalServerErrorException(
          `Sahne ${scene?.order ?? index + 1}: narrationText ve visualPrompt zorunludur.`,
        );
      }

      if (!Number.isFinite(scene.order)) scene.order = index + 1;
      if (!Number.isFinite(scene.durationSeconds) || scene.durationSeconds < 1) {
        scene.durationSeconds = 5;
      }
      if (!scene.subtitleText) scene.subtitleText = scene.narrationText;
      if (!scene.transitionType) scene.transitionType = 'CUT';
      // The prompt allows null here; the declared type is string | undefined.
      if (scene.ambientSoundPrompt == null) delete scene.ambientSoundPrompt;
    }

    // Derive the total from the scenes when the model omitted or garbled it,
    // so downstream text such as `PT${total}S` never contains "undefined".
    const total = parsed.metadata.totalDurationSeconds;
    if (!Number.isFinite(total) || total <= 0) {
      parsed.metadata.totalDurationSeconds = parsed.scenes.reduce(
        (sum, scene) => sum + scene.durationSeconds,
        0,
      );
    }
    if (!Array.isArray(parsed.metadata.hashtags)) {
      parsed.metadata.hashtags = [];
    }

    if (!parsed.musicPrompt) {
      parsed.musicPrompt = 'Cinematic orchestral, mysterious, 80 BPM, minor key, strings and piano, slow ethereal build';
    }
    if (!parsed.musicStyle) {
      parsed.musicStyle = 'cinematic-orchestral';
    }
    if (!VideoAiService.isObject(parsed.musicTechnical)) {
      parsed.musicTechnical = {
        bpm: 80,
        key: 'C minor',
        instruments: ['strings', 'piano', 'brass'],
        emotionalArc: 'calm-to-building-to-resolve',
      };
    } else if (parsed.musicTechnical.key == null) {
      // Same null-vs-optional mismatch as ambientSoundPrompt.
      delete parsed.musicTechnical.key;
    }
    if (!Array.isArray(parsed.ambientSoundPrompts)) {
      parsed.ambientSoundPrompts = [];
    }
    if (!parsed.voiceStyle) {
      parsed.voiceStyle = 'Deep, authoritative male voice, warm tone, measured pacing for data, slight dramatic pauses for reveals';
    }

    return parsed;
  }

  /** Removes a leading ```json / ``` fence and a trailing ``` fence, if present. */
  private static stripCodeFences(rawText: string): string {
    let cleanText = rawText.trim();
    if (cleanText.startsWith('```json')) cleanText = cleanText.slice(7);
    if (cleanText.startsWith('```')) cleanText = cleanText.slice(3);
    if (cleanText.endsWith('```')) cleanText = cleanText.slice(0, -3);
    return cleanText.trim();
  }

  /** True for non-null, non-array objects. */
  private static isObject(value: unknown): boolean {
    return typeof value === 'object' && value !== null && !Array.isArray(value);
  }

  /** True for strings with at least one character. */
  private static isNonEmptyString(value: unknown): boolean {
    return typeof value === 'string' && value.length > 0;
  }
}