main
Some checks failed
Backend Deploy 🚀 / build-and-deploy (push) Has been cancelled

This commit is contained in:
Harun CAN
2026-03-23 14:14:52 +03:00
parent 9bd2b4a2dd
commit c1e081478c
7 changed files with 1371 additions and 126 deletions

701
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -52,6 +52,7 @@
"pino": "^10.1.0",
"pino-http": "^11.0.0",
"prisma": "^5.22.0",
"puppeteer": "^24.40.0",
"reflect-metadata": "^0.2.2",
"rxjs": "^7.8.1",
"zod": "^4.3.5"

View File

@@ -15,6 +15,7 @@ import { SeoModule } from '../seo/seo.module';
import { NeuroMarketingModule } from '../neuro-marketing/neuro-marketing.module';
import { GeminiModule } from '../gemini/gemini.module';
import { VisualGenerationModule } from '../visual-generation/visual-generation.module';
import { WebScraperService } from '../trends/services/web-scraper.service';
@Module({
@@ -28,6 +29,7 @@ import { VisualGenerationModule } from '../visual-generation/visual-generation.m
HashtagService,
BrandVoiceService,
VariationService,
WebScraperService,
],
controllers: [ContentGenerationController],
exports: [ContentGenerationService],

View File

@@ -14,11 +14,13 @@ import { SeoService, FullSeoAnalysis as SeoDTO } from '../seo/seo.service';
import { NeuroMarketingService } from '../neuro-marketing/neuro-marketing.service';
import { StorageService } from '../visual-generation/services/storage.service';
import { VisualGenerationService } from '../visual-generation/visual-generation.service';
import { WebScraperService, ScrapedContent } from '../trends/services/web-scraper.service';
import { ContentType as PrismaContentType, ContentStatus as PrismaContentStatus, MasterContentType as PrismaMasterContentType } from '@prisma/client';
export interface ContentGenerationRequest {
topic: string;
sourceUrl?: string;
niche?: string;
platforms: Platform[];
includeResearch?: boolean;
@@ -76,6 +78,7 @@ export class ContentGenerationService {
private readonly neuroService: NeuroMarketingService,
private readonly storageService: StorageService,
private readonly visualService: VisualGenerationService,
private readonly webScraperService: WebScraperService,
) { }
@@ -87,6 +90,7 @@ export class ContentGenerationService {
async generateContent(request: ContentGenerationRequest): Promise<GeneratedContentBundle> {
const {
topic,
sourceUrl,
niche,
platforms,
includeResearch = true,
@@ -99,6 +103,26 @@ export class ContentGenerationService {
console.log(`[ContentGenerationService] Starting generation for topic: ${topic}, platforms: ${platforms.join(', ')}`);
// ========== STEP 1: Scrape source article if URL provided ==========
let scrapedSource: ScrapedContent | null = null;
if (sourceUrl) {
this.logger.log(`Scraping source article: ${sourceUrl}`);
try {
scrapedSource = await this.webScraperService.scrapeUrl(sourceUrl, {
extractImages: true,
extractLinks: true,
timeout: 15000,
}, topic);
if (scrapedSource) {
this.logger.log(`Scraped source: ${scrapedSource.wordCount} words, ${scrapedSource.images.length} images, ${scrapedSource.videoLinks.length} videos`);
} else {
this.logger.warn(`Failed to scrape source URL: ${sourceUrl}`);
}
} catch (err) {
this.logger.warn(`Source scraping error: ${err.message}`);
}
}
// Analyze niche if provided
let nicheAnalysis: NicheAnalysis | undefined;
if (niche) {
@@ -116,6 +140,23 @@ export class ContentGenerationService {
});
}
// ========== Build enriched context from scraped source ==========
let sourceContext = '';
if (scrapedSource) {
const articleText = scrapedSource.content.substring(0, 3000);
const videoInfo = scrapedSource.videoLinks.length > 0
? `\nVİDEO LİNKLERİ: ${scrapedSource.videoLinks.join(', ')}`
: '';
const importantLinks = scrapedSource.links
.filter(l => l.isExternal && !l.href.includes('facebook') && !l.href.includes('twitter'))
.slice(0, 5)
.map(l => `${l.text}: ${l.href}`)
.join('\n');
const linkInfo = importantLinks ? `\nÖNEMLİ LİNKLER:\n${importantLinks}` : '';
sourceContext = `\n\n📰 KAYNAK MAKALE İÇERİĞİ (ZORUNLU REFERANS):\n${articleText}${videoInfo}${linkInfo}\n\n⚠ ÖNEMLİ: Yukarıdaki kaynak makaledeki TÜM özneleri (kişi, ürün, oyun adları, tarihler, fiyatlar, markalar) habere dahil et. Hiçbir önemli bilgiyi atlama. Video linkleri ve önemli dış linkler varsa bunları da içerikte paylaş.`;
}
// Generate content for each platform using AI
const platformContent: GeneratedContent[] = [];
for (const platform of platforms) {
@@ -127,11 +168,13 @@ export class ContentGenerationService {
const sanitizedSummary = this.sanitizeResearchSummary(
research?.summary || `Everything you need to know about ${topic}`
);
// Append scraped source context to give AI the full article details
const enrichedSummary = sanitizedSummary + sourceContext;
// Normalize platform to lowercase for consistency
const normalizedPlatform = platform.toLowerCase();
const aiContent = await this.platformService.generateAIContent(
topic,
sanitizedSummary,
enrichedSummary,
normalizedPlatform as any, // Cast to any/Platform to resolve type mismatch if Platform is strict union
'standard',
'tr',
@@ -145,6 +188,9 @@ export class ContentGenerationService {
this.logger.warn(`AI Content is empty for ${platform}`);
}
// Use scraped image from source if available
const sourceImageUrl = scrapedSource?.images?.[0]?.src || undefined;
const config = this.platformService.getPlatformConfig(platform);
let content: GeneratedContent = {
platform,
@@ -163,10 +209,19 @@ export class ContentGenerationService {
content.content = voiceApplied.branded;
}
// Add hashtags if requested
// Add hashtags using AI (based on actual generated content)
if (includeHashtags) {
const hashtagSet = this.hashtagService.generateHashtags(topic, platform);
content.hashtags = hashtagSet.hashtags.map((h) => h.hashtag);
try {
content.hashtags = await this.platformService.generateAIHashtags(
content.content,
topic,
platform as any,
'tr',
);
} catch (hashErr) {
this.logger.warn(`AI hashtag generation failed, skipping: ${hashErr.message}`);
content.hashtags = [];
}
}
// Generate image for visual platforms
@@ -180,11 +235,31 @@ export class ContentGenerationService {
platform: platformKey,
enhancePrompt: true,
});
content.imageUrl = image.url;
this.logger.log(`Image generated for ${platform}: ${image.url}`);
// Check if image is a real image or just a placeholder
const isPlaceholder = image.url?.includes('placehold.co') || image.url?.includes('placeholder');
if (!isPlaceholder) {
content.imageUrl = image.url;
this.logger.log(`Image generated for ${platform}: ${image.url}`);
} else if (sourceImageUrl) {
// Use scraped source image instead of placeholder
content.imageUrl = sourceImageUrl;
this.logger.log(`Using scraped source image instead of placeholder: ${sourceImageUrl}`);
} else {
content.imageUrl = image.url;
this.logger.log(`Image generated for ${platform}: ${image.url} (placeholder, no source image available)`);
}
} catch (imgError) {
this.logger.warn(`Image generation failed for ${platform}, continuing without image`, imgError);
// Fallback to scraped source image
if (sourceImageUrl) {
content.imageUrl = sourceImageUrl;
this.logger.log(`Using scraped source image as fallback: ${sourceImageUrl}`);
}
}
} else if (sourceImageUrl && !content.imageUrl) {
// For non-visual platforms, still attach source image if available
content.imageUrl = sourceImageUrl;
}
platformContent.push(content);
@@ -358,7 +433,7 @@ export class ContentGenerationService {
userId: effectiveUserId!,
masterContentId: masterContent.id,
type: contentType,
title: `${bundle.topic} - ${platformContent.platform}`,
title: this.sanitizeResearchSummary(`${bundle.topic}`) + ` - ${platformContent.platform}`,
body: platformContent.content,
hashtags: platformContent.hashtags,
status: PrismaContentStatus.DRAFT,
@@ -548,6 +623,8 @@ KURALLAR:
6. Karakter limitini koru
7. Platformun tonuna uygun yaz
8. SADECE yayınlanacak metni yaz
9. Hiçbir haber sitesi, kaynak, ajans veya web sitesi adı kullanma
10. "...göre", "...haberlere göre", "...kaynağına göre" gibi atıf ifadeleri ASLA kullanma
SADECE yeniden yazılmış metni döndür, açıklama ekleme.`;
@@ -589,25 +666,43 @@ SADECE yeniden yazılmış metni döndür, açıklama ekleme.`;
sanitized = sanitized.replace(/https?:\/\/[^\s]+/gi, '');
sanitized = sanitized.replace(/www\.[^\s]+/gi, '');
// Remove common Turkish attribution phrases
// Remove common attribution phrases (Turkish and English)
const attributionPatterns = [
/\b\w+\.com(\.tr)?\b/gi,
/\b\w+\.org(\.tr)?\b/gi,
/\b\w+\.net(\.tr)?\b/gi,
/\bkaynağına göre\b/gi,
/\b'e göre\b/gi,
/\b'(i|a|e|u|ü|\u0131)n(da|de) (yayınlanan|yer alan|çıkan)\b/gi,
/\b(da|de) (çıkan|yayınlanan|yer alan) (haberlere|habere|bilgilere) göre\b/gi,
/\bhaberlere göre\b/gi,
/\braporuna göre\b/gi,
/\bsitesinde yer alan\b/gi,
/\baçıklamasına göre\b/gi,
/\byazısına göre\b/gi,
/\bhaberine göre\b/gi,
/\btarafından yapılan\b/gi,
/\baccording to [^,.]+/gi,
/\breported by [^,.]+/gi,
/\bas reported in [^,.]+/gi,
/\bsource:\s*[^,.]+/gi,
/\breferans:\s*[^,.]+/gi,
/\bkaynak:\s*[^,.]+/gi,
];
// Common Turkish tech/news source brands to strip
// Comprehensive list of Turkish tech/news source brands to strip
const sourceNames = [
'donanımhaber', 'technopat', 'webtekno', 'shiftdelete',
'tamindir', 'donanımhaber', 'technopat', 'webtekno', 'shiftdelete',
'chip online', 'log.com', 'mediatrend', 'bbc', 'cnn',
'reuters', 'anadolu ajansı', 'hürriyet', 'milliyet',
'sabah', 'forbes', 'bloomberg', 'techcrunch',
'the verge', 'engadget', 'ars technica', 'wired',
'mashable', 'gizmodo', 'tom\'s hardware', 'tom\'s guide',
'ntv', 'habertürk', 'sozcu', 'sözcü', 'cumhuriyet', 'star',
'posta', 'aksam', 'yeni safak', 'yeni şafak', 'takvim',
'mynet', 'ensonhaber', 'haber7', 'internethaber',
'ad hoc news', 'finanzen.net', 'der aktionär', 'aktionar',
'business insider', 'cnbc', 'financial times', 'wall street journal',
];
for (const pattern of attributionPatterns) {
@@ -615,12 +710,15 @@ SADECE yeniden yazılmış metni döndür, açıklama ekleme.`;
}
for (const source of sourceNames) {
const regex = new RegExp(`\\b${source}\\b`, 'gi');
const regex = new RegExp(`\\b${source.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`, 'gi');
sanitized = sanitized.replace(regex, '');
}
// Clean up multiple spaces and trailing commas
sanitized = sanitized.replace(/\s{2,}/g, ' ').replace(/,\s*,/g, ',').trim();
// Also remove "- site_name" patterns from titles (e.g. "Great News - Tamindir")
sanitized = sanitized.replace(/\s*-\s*$/gm, '');
// Clean up multiple spaces, trailing commas, and orphaned punctuation
sanitized = sanitized.replace(/\s{2,}/g, ' ').replace(/,\s*,/g, ',').replace(/\s+([.,;:!?])/g, '$1').trim();
return sanitized;
}

View File

@@ -502,21 +502,39 @@ TON: ${config.tone}${styleInstruction}${ctaInstruction}
Bu platform için özgün, ilgi çekici ve viral potansiyeli yüksek bir içerik oluştur.
KURALLAR:
📈 SEO OPTİMİZASYONU (ZORUNLU):
- Bu konuyu Google'da, YouTube'da veya sosyal medyada arayan biri hangi kelimeleri kullanır? O kelimeleri belirle ve içeriğe yerleştir.
- İLK 2 CÜMLEDE arama hacmi en yüksek anahtar kelimeleri MUTLAKA kullan.
- Hook/giriş cümlesi birincil anahtar kelimeyi içersin.
- Anahtar kelimeleri doğal bir akış içinde kullan, zoraki tekrar yapma.
- Konu ile ilgili en çok aranan terimleri, teknik terimleri ve marka/ürün adlarını (haberin konusu olan markaları — kaynak değil) ön plana çıkar.
KRİTİK KURALLAR:
1. Karakter limitine uy
2. Platformun tonuna uygun yaz
3. Hook (dikkat çeken giriş) ile başla
4. CTA ile bitir (yukarıdaki CTA talimatına göre)
5. Emoji kullan ama aşırıya kaçma
6. ${language === 'tr' ? 'Türkçe' : 'İngilizce'} yaz
7. ASLA resim URL'i, medya linki veya [görsel] gibi yer tutucular ekleme
8. Görsel betimlemeleri metnin içine YAZMA
9. İçerik %100 özgün olmalı - asla kaynak kopyası yapma
10. Kaynak linklerini, URL'leri veya atıfları ASLA ekleme
11. Mevcut içeriklerden alıntı yapma, tamamen yeni ve orijinal yaz
12. Bilgiyi kendi cümlelerinle ifade et, paraphrase bile yapma
13. Araştırma kaynaklarının isimlerini (web siteleri, haber siteleri, markalar, gazeteler) ASLA metinde kullanma veya referans verme
14. "...göre", "...kaynağına göre", "according to" gibi atıf ifadeleri ASLA kullanma
7. İçerik %100 özgün olmalı - asla kaynak kopyası yapma
8. Bilgiyi kendi cümlelerinle ifade et, paraphrase bile yapma
⚠️ YAYIN HAZIR İÇERİK (ÇOK ÖNEMLİ):
- İçerik doğrudan kopyala-yapıştır ile yayınlanabilir olmalı
- "[Buraya Link]", "[Link Ekle]", "[URL]", "[Görsel]", "[Video]" gibi YER TUTUCU İFADELER ASLA kullanma
- Resim URL'i, medya linki veya placeholder ASLA ekleme
- Görsel betimlemeleri metnin içine YAZMA
- "Linke tıklayın", "Bio'daki linke gidin" gibi CTA'lar kullanabilirsin ama asla köşeli parantez içinde placeholder koyma
- Eğer bir link veya URL bilmiyorsan, o kısmı tamamen atla — placeholder bırakma
- İçerikte doldurulması gereken boşluk OLMAMALI
⛔ KAYNAK YASAĞI (EN ÖNEMLİ KURAL):
- Hiçbir haber sitesi, web sitesi, gazete, ajans, blog veya medya kuruluşu adını ASLA yazma
- "Tamindir", "Webtekno", "DonanımHaber", "ShiftDelete", "TechCrunch", "BBC", "CNN", "Reuters", "Forbes", "Bloomberg" gibi site/kaynak adlarını ASLA kullanma
- "...haberlere göre", "...raporuna göre", "...kaynağına göre", "...sitesinde yer alan", "...çıkan haberlere göre", "according to", "...tarafından yapılan" gibi ATıF İFADELERİ ASLA kullanma
- Haberin nereden alındığını BELİRTME, doğrudan bilgiyi kendi cümlelerinle anlat
- İçerikte kaynak gösterme, referans verme veya atıf yapma YOK
- Bilgi/veri paylaşırken kaynağı belirtmeden doğrudan bilgiyi ver
SADECE yayınlanacak metni yaz, açıklama veya başlık ekleme.`;
@@ -532,6 +550,107 @@ SADECE yayınlanacak metni yaz, açıklama veya başlık ekleme.`;
}
}
/**
 * Generate relevant, SEO-optimized hashtags using AI.
 * Replaces the old mock-based hashtag generation.
 *
 * @param content  Generated post body; only the first 500 chars are sent to the model
 * @param topic    Content topic, used for model context and for the non-AI fallback
 * @param platform Target platform; its config caps how many hashtags are returned
 * @param language 'tr' permits mixed Turkish/English tags, anything else requests English
 * @returns '#'-prefixed hashtags; empty when the platform allows none
 */
async generateAIHashtags(
  content: string,
  topic: string,
  platform: Platform,
  language: string = 'tr',
): Promise<string[]> {
  const config = this.platforms[platform];
  if (!config || config.maxHashtags === 0) return [];

  if (!this.gemini.isAvailable()) {
    this.logger.warn('Gemini not available for hashtag generation, using fallback');
    return this.generateFallbackHashtags(topic, config.maxHashtags);
  }

  // Instagram technically allows more, but cap at 15 to avoid spammy tag walls.
  const maxCount = platform === 'instagram'
    ? Math.min(config.maxHashtags, 15)
    : config.maxHashtags;

  const prompt = `Sen bir sosyal medya SEO uzmanısın. Aşağıdaki içerik ve konu için ${platform.toUpperCase()} platformunda kullanılacak EN UYGUN ${maxCount} hashtag üret.
KONU: ${topic}
İÇERİK:
${content.substring(0, 500)}
HASHTAG KURALLARI:
1. Her hashtag DOĞRUDAN içerikle ilgili olmalı — genel veya ilişkisiz hashtag OLMASIN
2. Arama hacmi yüksek, gerçek kullanıcıların arayacağı kelimelerden oluştur
3. Konunun ana terimleri, teknik terimleri, marka/ürün adları (haberin konusu olanlar) ve sektör terimleri olsun
4. "tips", "howto", "life", "community", "motivation", "goals" gibi genel son ekler KULLANMA
5. ${language === 'tr' ? 'Türkçe ve İngilizce karışık olabilir, hangisi daha çok aranıyorsa onu seç' : 'Use English hashtags'}
6. Hashtag'ı # ile başlat
7. Tek kelime veya kısa bileşik kelimeler kullan (boşluk yok)
8. Haber kaynağı olan sitelerin adlarını (Webtekno, Tamindir, DonanımHaber, ShiftDelete, TechCrunch, BBC, CNN vb.) ASLA hashtag olarak kullanma — bunlar bizim kaynağımız, içeriğimiz değil
SADECE hashtag listesini döndür, her satırda bir hashtag. Başka açıklama ekleme.
ÖRNEK FORMAT:
#hashtag1
#hashtag2
#hashtag3`;

  try {
    const response = await this.gemini.generateText(prompt, {
      temperature: 0.4,
      maxTokens: 200,
    });

    // Source/brand names that must never appear as hashtags (we aggregate FROM
    // them; they are not the subject). Entries are lowercase and space-free to
    // match the normalized tags below. (Previous list had a duplicate
    // 'donanımhaber' and the typo 'adhoçnews'.)
    const bannedSources = new Set([
      'tamindir', 'webtekno', 'donanımhaber', 'shiftdelete',
      'technopat', 'chipsonline', 'chiponline', 'mediatrend', 'hürriyet',
      'milliyet', 'sabah', 'ntv', 'habertürk', 'sözcü', 'sozcu',
      'cumhuriyet', 'posta', 'aksam', 'takvim', 'mynet', 'ensonhaber',
      'haber7', 'internethaber', 'bbc', 'cnn', 'reuters', 'forbes',
      'bloomberg', 'techcrunch', 'theverge', 'engadget', 'wired',
      'gizmodo', 'mashable', 'businessinsider', 'cnbc', 'adhocnews',
      'finanzennet', 'deraktionär', 'aktionar',
    ]);

    const hashtags = response.text
      .split('\n')
      .map(line => line.trim())
      .filter(line => line.startsWith('#') && line.length > 1)
      .map(tag => tag.replace(/\s+/g, ''))
      // Drop any tag that is actually a news-source name.
      .filter(tag => !bannedSources.has(tag.replace('#', '').toLowerCase()))
      .slice(0, maxCount);

    if (hashtags.length === 0) {
      this.logger.warn('AI returned no valid hashtags, using fallback');
      return this.generateFallbackHashtags(topic, maxCount);
    }

    this.logger.log(`AI generated ${hashtags.length} hashtags for ${platform}: ${hashtags.join(', ')}`);
    return hashtags;
  } catch (error) {
    this.logger.error(`AI hashtag generation failed: ${error.message}`);
    return this.generateFallbackHashtags(topic, maxCount);
  }
}
/**
 * Fallback hashtag generation when AI is unavailable.
 * Builds tags from the meaningful words of the topic itself instead of
 * appending generic suffixes like "tips" or "life".
 *
 * @param topic    Content topic to mine for hashtag words
 * @param maxCount Upper bound on the number of tags returned
 * @returns Up to maxCount '#'-prefixed lowercase words from the topic
 */
private generateFallbackHashtags(topic: string, maxCount: number): string[] {
  // Turkish + English filler words that make useless hashtags.
  const ignored = new Set(['ve', 'ile', 'bir', 'bu', 'için', 'da', 'de', 'the', 'a', 'an', 'and', 'or', 'for', 'in', 'on', 'is', 'are', 'was', 'of', 'to']);

  // Lowercase and strip punctuation; the class keeps Turkish letters intact.
  const normalized = topic.toLowerCase().replace(/[^a-zçğıöşü\w\s]/gi, '');

  const tags: string[] = [];
  for (const word of normalized.split(/\s+/)) {
    if (tags.length >= maxCount) break;
    // Short tokens (<= 3 chars) are almost never searchable hashtags.
    if (word.length > 3 && !ignored.has(word)) {
      tags.push(`#${word}`);
    }
  }
  return tags;
}
private generateTemplateContent(
topic: string,
mainMessage: string,

View File

@@ -270,7 +270,7 @@ export class ContentOptimizationService {
if (!keyword) return 50;
const words = content.split(/\s+/).length;
const kwCount = (content.toLowerCase().match(new RegExp(keyword.toLowerCase(), 'g')) || []).length;
const kwCount = content.toLowerCase().split(keyword.toLowerCase()).length - 1;
const density = (kwCount / words) * 100;
if (density >= this.optimalParams.keywordDensity.min &&

View File

@@ -2,6 +2,7 @@
// Path: src/modules/trends/services/web-scraper.service.ts
import { Injectable, Logger } from '@nestjs/common';
import * as puppeteer from 'puppeteer';
export interface ScrapedContent {
url: string;
@@ -12,6 +13,7 @@ export interface ScrapedContent {
headings: { level: number; text: string }[];
links: { text: string; href: string; isExternal: boolean }[];
images: { src: string; alt: string }[];
videoLinks: string[];
metadata: {
author?: string;
publishDate?: string;
@@ -63,37 +65,362 @@ export interface ScraperOptions {
export class WebScraperService {
private readonly logger = new Logger(WebScraperService.name);
private readonly contentCache = new Map<string, ScrapedContent>();
private readonly defaultUserAgent = 'ContentHunter/1.0 (Research Bot)';
private readonly defaultUserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
/**
* Scrape content from a web page
* Scrape content from a web page.
* Automatically resolves Google News redirect URLs.
* @param url The URL to scrape
* @param options Scraper options
* @param articleTitle The article title (used for Google News URL resolution via search)
*/
async scrapeUrl(url: string, options?: ScraperOptions): Promise<ScrapedContent | null> {
async scrapeUrl(url: string, options?: ScraperOptions, articleTitle?: string): Promise<ScrapedContent | null> {
// Validate URL
if (!this.isValidUrl(url)) {
this.logger.warn(`Invalid URL: ${url}`);
return null;
}
// Resolve Google News redirect URLs to actual article URLs
let resolvedUrl = url;
if (url.includes('news.google.com') || url.includes('google.com/rss')) {
this.logger.log(`Detected Google News URL: ${url}`);
// Strategy 1: Use Puppeteer headless browser to follow JS redirects (most reliable)
const puppeteerResult = await this.resolveGoogleNewsWithPuppeteer(url);
if (puppeteerResult) {
resolvedUrl = puppeteerResult;
this.logger.log(`Puppeteer resolved Google News URL to: ${resolvedUrl}`);
} else {
// Strategy 2: Fall back to DuckDuckGo title search
this.logger.warn('Puppeteer resolution failed, trying DuckDuckGo title search...');
if (articleTitle) {
const searchResult = await this.findArticleByTitle(articleTitle);
if (searchResult) {
resolvedUrl = searchResult;
this.logger.log(`DuckDuckGo found article URL: ${resolvedUrl}`);
} else {
this.logger.warn('Both Puppeteer and DuckDuckGo failed. Cannot resolve Google News URL.');
return null;
}
} else {
this.logger.warn('No article title provided for fallback search. Cannot resolve Google News URL.');
return null;
}
}
}
// Check cache
const cached = this.contentCache.get(url);
const cached = this.contentCache.get(resolvedUrl);
if (cached && this.isCacheValid(cached)) {
return cached;
}
try {
const response = await this.fetchPage(url, options);
if (!response) return null;
this.logger.log(`Scraping URL: ${resolvedUrl}`);
const response = await this.fetchPage(resolvedUrl, options);
if (!response) {
this.logger.warn(`fetchPage returned null for ${resolvedUrl}`);
return null;
}
const content = this.parseHtml(response.html, url, options);
this.logger.log(`Fetched HTML: ${response.html.length} chars`);
const content = this.parseHtml(response.html, resolvedUrl, options);
content.html = options?.includeHtml ? response.html : '';
// Cache the result
this.contentCache.set(url, content);
this.contentCache.set(resolvedUrl, content);
this.logger.log(`Scraped successfully: ${content.title}, ${content.images.length} images, ${content.videoLinks.length} videos, ${content.wordCount} words`);
return content;
} catch (error) {
this.logger.error(`Failed to scrape ${url}:`, error);
this.logger.error(`Failed to scrape ${resolvedUrl}:`, error);
return null;
}
}
/**
 * Resolve a Google News redirect URL to the actual article URL using Puppeteer.
 * Google News uses JavaScript-only redirects that cannot be followed via HTTP,
 * so a headless browser is launched to execute the redirect script.
 *
 * @param googleNewsUrl A news.google.com article/redirect URL
 * @returns The final non-Google article URL, or null when resolution fails
 */
private async resolveGoogleNewsWithPuppeteer(googleNewsUrl: string): Promise<string | null> {
  let browser: puppeteer.Browser | null = null;
  try {
    this.logger.log(`Launching Puppeteer to resolve Google News URL...`);
    browser = await puppeteer.launch({
      headless: true,
      // NOTE(review): --no-sandbox/--disable-setuid-sandbox are presumably for a
      // containerized deploy where Chromium runs as root — confirm against the
      // deployment environment before removing.
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-extensions',
        '--disable-background-networking',
        '--window-size=1280,720',
      ],
      timeout: 20000,
    });
    const page = await browser.newPage();
    // Set a realistic user agent (headless defaults get bot-blocked)
    await page.setUserAgent(this.defaultUserAgent);
    // Block unnecessary resources to speed up loading
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      const resourceType = request.resourceType();
      if (['image', 'stylesheet', 'font', 'media'].includes(resourceType)) {
        request.abort();
      } else {
        request.continue();
      }
    });
    // Navigate to the Google News URL
    await page.goto(googleNewsUrl, {
      waitUntil: 'networkidle2',
      timeout: 15000,
    });
    // Wait for the redirect to complete — check periodically if URL changed
    let finalUrl = page.url();
    const startTime = Date.now();
    const maxWait = 10000; // 10 seconds max
    while (finalUrl.includes('news.google.com') && (Date.now() - startTime) < maxWait) {
      await new Promise(resolve => setTimeout(resolve, 500));
      finalUrl = page.url();
    }
    // Close eagerly and null out so the finally block doesn't double-close.
    await browser.close();
    browser = null;
    // Check if we successfully left Google News
    // (consent.google.com means a cookie-consent interstitial blocked us).
    if (!finalUrl.includes('news.google.com') && !finalUrl.includes('consent.google.com')) {
      this.logger.log(`Puppeteer resolved to: ${finalUrl}`);
      return finalUrl;
    } else {
      this.logger.warn(`Puppeteer could not resolve - still on Google domain: ${finalUrl}`);
      return null;
    }
  } catch (error) {
    this.logger.warn(`Puppeteer resolution failed: ${error.message}`);
    return null;
  } finally {
    // Guarantee the Chromium process is reaped even when launch/goto throws.
    if (browser) {
      try { await browser.close(); } catch (e) { /* ignore */ }
    }
  }
}
/**
 * Find the actual article URL by searching for the article title.
 * Google News RSS titles usually end in " - SourceName"; that source name is
 * split off and prepended to the query to bias results toward the source site.
 * Uses DuckDuckGo HTML search (more bot-friendly than Google).
 *
 * NOTE(review): this does NOT use DuckDuckGo's `site:` operator — it only
 * prepends "<source>.com " as a plain search term, and it assumes a .com
 * domain (many Turkish outlets are .com.tr) — confirm this is intended.
 *
 * @param title Article title, optionally suffixed with " - SourceName"
 * @returns The best-matching result URL, or null when search fails/finds nothing
 */
private async findArticleByTitle(title: string): Promise<string | null> {
  try {
    // Extract source name from title (usually at the end after " - ")
    const parts = title.split(/\s+-\s+/);
    const sourceName = parts.length > 1 ? parts[parts.length - 1].trim() : '';
    const cleanTitle = parts.length > 1 ? parts.slice(0, -1).join(' - ').trim() : title;
    // Remove brackets and special chars from title for better search
    // (the Unicode ranges keep Latin-extended letters, i.e. Turkish diacritics).
    const searchableTitle = cleanTitle
      .replace(/\[.*?\]/g, '')
      .replace(/[^\w\s\u00C0-\u024F\u0100-\u017F\u011E-\u011F\u0130-\u0131\u015E-\u015F\u00D6\u00F6\u00DC\u00FC\u00C7\u00E7]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();
    // Build search query — prepend the source's presumed domain when known
    const sourceDomain = sourceName.toLowerCase().replace(/\s+/g, '');
    const siteFilter = sourceName ? `${sourceDomain}.com ` : '';
    const searchQuery = `${siteFilter}${searchableTitle}`;
    this.logger.log(`Searching DuckDuckGo for article: ${searchQuery}`);
    // Use DuckDuckGo HTML search (more bot-friendly)
    const searchUrl = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(searchQuery)}`;
    const response = await fetch(searchUrl, {
      headers: {
        'User-Agent': this.defaultUserAgent,
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
      },
    });
    if (!response.ok) {
      this.logger.warn(`DuckDuckGo search returned ${response.status}`);
      return null;
    }
    const html = await response.text();
    // DuckDuckGo HTML results wrap target URLs in redirect links whose
    // uddg= query parameter holds the (URL-encoded) destination URL.
    const ddgPattern = /uddg=(https?[^&"]+)/g;
    const foundUrls: string[] = [];
    const seen = new Set<string>();
    let match;
    while ((match = ddgPattern.exec(html)) !== null) {
      const foundUrl = decodeURIComponent(match[1]);
      // Skip duplicates, DDG's own links, and anything pointing back at Google.
      if (!seen.has(foundUrl) && !foundUrl.includes('duckduckgo.com') && !foundUrl.includes('google.com')) {
        seen.add(foundUrl);
        foundUrls.push(foundUrl);
      }
    }
    if (foundUrls.length > 0) {
      // Prefer URLs matching the source name
      if (sourceName) {
        const sourceUrl = foundUrls.find(u => u.toLowerCase().includes(sourceDomain));
        if (sourceUrl) {
          this.logger.log(`Found matching source URL via DuckDuckGo: ${sourceUrl}`);
          return sourceUrl;
        }
      }
      this.logger.log(`Using first DuckDuckGo result: ${foundUrls[0]}`);
      return foundUrls[0];
    }
    this.logger.warn('No search results found for article title');
    return null;
  } catch (error) {
    this.logger.warn(`Article search failed: ${error.message}`);
    return null;
  }
}
/**
 * Resolve redirect URLs (especially Google News) to the final destination URL.
 * Tries, in order: decoding the base64 payload embedded in Google News RSS
 * URLs; following HTTP redirects (manual HEAD for the Location header, then a
 * full GET); and finally mining the fetched HTML for a redirect target
 * (data-* attribute, meta refresh, canonical link, or any external href).
 *
 * @param url The (possibly redirecting) URL to resolve
 * @returns The resolved URL — possibly still a Google URL when every HTML
 *          strategy fails — or null when the network requests themselves error.
 */
private async resolveRedirectUrl(url: string): Promise<string | null> {
  // Strategy 1: Decode Google News base64-encoded URL from the path
  try {
    const decoded = this.decodeGoogleNewsUrl(url);
    if (decoded) {
      this.logger.log(`Decoded Google News URL: ${decoded}`);
      return decoded;
    }
  } catch (e) {
    this.logger.warn(`Base64 decode failed: ${e.message}`);
  }
  // Strategy 2: Follow HTTP redirects
  try {
    // First try with redirect: 'manual' to read the Location header ourselves
    const headResponse = await fetch(url, {
      method: 'HEAD',
      headers: {
        'User-Agent': this.defaultUserAgent,
        'Accept': 'text/html',
      },
      redirect: 'manual',
    });
    const locationHeader = headResponse.headers.get('location');
    if (locationHeader && !locationHeader.includes('news.google.com')) {
      this.logger.log(`Redirect Location header: ${locationHeader}`);
      return locationHeader;
    }
    // Try full GET with redirect follow
    const getResponse = await fetch(url, {
      method: 'GET',
      headers: {
        'User-Agent': this.defaultUserAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      redirect: 'follow',
    });
    // fetch exposes the post-redirect URL; accept it if it left Google News.
    if (getResponse.url && !getResponse.url.includes('news.google.com')) {
      return getResponse.url;
    }
    // Strategy 3: Parse HTML for article link / meta refresh / canonical
    const html = await getResponse.text();
    // Check data-redirect/data-href/data-url attributes for a non-Google target
    const dataRedirect = html.match(/data-(?:redirect|href|url)=["'](https?:\/\/(?!news\.google\.com)[^"']+)["']/i);
    if (dataRedirect) return dataRedirect[1];
    // Meta refresh
    const metaRefresh = html.match(/<meta[^>]*http-equiv=["']refresh["'][^>]*content=["']\d+;\s*url=([^"']+)["']/i);
    if (metaRefresh) return metaRefresh[1];
    // Canonical
    const canonical = html.match(/<link[^>]*rel=["']canonical["'][^>]*href=["']([^"']+)["']/i);
    if (canonical && !canonical[1].includes('news.google.com')) return canonical[1];
    // Any external link that looks like an article
    const externalLink = html.match(/href=["'](https?:\/\/(?!(?:news|www)\.google\.com)[^"']+)["']/i);
    if (externalLink) return externalLink[1];
    // Give up: hand back whatever URL the GET ended on (may still be Google).
    this.logger.warn(`Could not resolve Google News URL, returning original`);
    return getResponse.url;
  } catch (error) {
    this.logger.warn(`Failed to resolve redirect for ${url}: ${error.message}`);
    return null;
  }
}
/**
 * Decode the real article URL embedded in a Google News RSS link.
 * Google News encodes the target inside a base64url segment of the path,
 * e.g. https://news.google.com/rss/articles/CBMi{payload}.
 *
 * @param googleUrl A news.google.com URL
 * @returns The first non-Google URL found in the decoded payload, the first
 *          URL otherwise, or null when nothing decodable is present.
 */
private decodeGoogleNewsUrl(googleUrl: string): string | null {
  try {
    // Locate the path segment immediately after "/articles/".
    const segments = new URL(googleUrl).pathname.split('/');
    const idx = segments.indexOf('articles');
    if (idx === -1 || idx + 1 >= segments.length) {
      return null;
    }

    // Drop any query string that ended up glued to the segment.
    const token = segments[idx + 1].split('?')[0];

    // Convert base64url ('-'/'_') to standard base64 and restore '=' padding.
    let b64 = token.replace(/-/g, '+').replace(/_/g, '/');
    while (b64.length % 4 !== 0) {
      b64 += '=';
    }

    const payload = Buffer.from(b64, 'base64').toString('utf-8');

    // Pull every URL-looking run out of the (partially binary) payload.
    const candidates = payload.match(/https?:\/\/[^\s"'<>\x00-\x1F]+/g);
    if (!candidates || candidates.length === 0) {
      return null;
    }

    // Prefer a URL that is not Google's own.
    const external = candidates.find(u => !u.includes('google.com'));
    if (external) {
      // Trim trailing binary garbage the regex may have swallowed.
      const cleanUrl = external.replace(/[\x00-\x1F\x7F-\x9F]+.*$/, '');
      this.logger.log(`Decoded article URL from base64: ${cleanUrl}`);
      return cleanUrl;
    }
    return candidates[0];
  } catch (error) {
    this.logger.warn(`Failed to decode Google News URL: ${error.message}`);
    return null;
  }
}
@@ -129,64 +456,57 @@ export class WebScraperService {
}
/**
* Fetch page content (simulated)
* Fetch page content using real HTTP fetch
*/
private async fetchPage(url: string, options?: ScraperOptions): Promise<{ html: string } | null> {
// In production, use:
// 1. node-fetch or axios for simple pages
// 2. Puppeteer/Playwright for JavaScript-rendered pages
// 3. Cheerio for HTML parsing
const timeout = options?.timeout || 15000;
const userAgent = options?.userAgent || this.defaultUserAgent;
// Simulated HTML for demonstration
const mockHtml = `
<!DOCTYPE html>
<html>
<head>
<title>Sample Article: Content Creation Strategies</title>
<meta name="description" content="Learn the best content creation strategies for 2024">
<meta name="author" content="John Doe">
<meta name="keywords" content="content, creation, marketing, strategy">
<meta property="og:title" content="Content Creation Strategies">
<meta property="og:description" content="Master content creation with these proven strategies">
<meta property="og:image" content="https://example.com/image.jpg">
</head>
<body>
<article>
<h1>10 Content Creation Strategies for 2024</h1>
<p class="author">By John Doe | Published: January 15, 2024</p>
try {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), timeout);
<h2>Introduction</h2>
<p>Content creation has evolved significantly over the past year. In this comprehensive guide, we'll explore the most effective strategies for creating engaging content.</p>
const response = await fetch(url, {
headers: {
'User-Agent': userAgent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
'Accept-Encoding': 'identity',
},
signal: controller.signal,
redirect: 'follow',
});
<h2>1. Focus on Value First</h2>
<p>The most successful content creators prioritize providing value to their audience. According to a recent study, 78% of consumers prefer brands that create custom content.</p>
clearTimeout(timeoutId);
<h2>2. Embrace Short-Form Video</h2>
<p>Short-form video continues to dominate. TikTok and Instagram Reels have shown that 15-60 second videos can generate massive engagement.</p>
if (!response.ok) {
this.logger.warn(`HTTP ${response.status} for ${url}`);
return null;
}
<blockquote>"Content is king, but distribution is queen." - Gary Vaynerchuk</blockquote>
const contentType = response.headers.get('content-type') || '';
if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
this.logger.warn(`Non-HTML content type: ${contentType} for ${url}`);
return null;
}
<h2>3. Use AI Wisely</h2>
<p>AI tools like ChatGPT and Claude can help with ideation and drafting, but human creativity remains essential for authentic content.</p>
const html = await response.text();
<h3>Key Statistics</h3>
<ul>
<li>85% of marketers use content marketing</li>
<li>Video content generates 1200% more shares</li>
<li>Long-form content gets 77% more backlinks</li>
</ul>
if (!html || html.length < 100) {
this.logger.warn(`Empty or very short response from ${url}`);
return null;
}
<h2>Conclusion</h2>
<p>Success in content creation requires a balance of strategy, creativity, and consistency. Start implementing these strategies today!</p>
<a href="/related-article">Read more articles</a>
<a href="https://external.com/resource">External resource</a>
</article>
</body>
</html>
`;
return { html: mockHtml };
this.logger.log(`Successfully fetched ${url} (${html.length} chars)`);
return { html };
} catch (error) {
if (error.name === 'AbortError') {
this.logger.warn(`Request timed out for ${url}`);
} else {
this.logger.error(`Failed to fetch ${url}: ${error.message}`);
}
return null;
}
}
/**
 * Parse HTML content into structured ScrapedContent
 */
private parseHtml(html: string, url: string, options?: ScraperOptions): ScrapedContent {
const domain = new URL(url).hostname;
const baseUrl = new URL(url).origin;
// Extract title
const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
@@ -212,8 +533,11 @@ export class WebScraperService {
// Extract links
const links = options?.extractLinks !== false ? this.extractLinks(html, domain) : [];
// Extract images
const images = options?.extractImages !== false ? this.extractImages(html) : [];
// Extract images with absolute URL resolution
const images = options?.extractImages !== false ? this.extractImages(html, baseUrl) : [];
// Extract video links (YouTube, etc.)
const videoLinks = this.extractVideoLinks(html);
// Extract metadata
const metadata = this.extractMetadata(html);
@@ -232,6 +556,7 @@ export class WebScraperService {
headings,
links,
images,
videoLinks,
metadata,
wordCount,
readingTime,
@@ -306,21 +631,82 @@ export class WebScraperService {
}
/**
* Extract images from HTML
* Extract images from HTML with absolute URL resolution
*/
private extractImages(html: string): { src: string; alt: string }[] {
private extractImages(html: string, baseUrl?: string): { src: string; alt: string }[] {
const images: { src: string; alt: string }[] = [];
// Match src before alt, or alt before src
const regex = /<img[^>]*src=["']([^"']+)["'][^>]*(?:alt=["']([^"']*)["'])?/gi;
const regex2 = /<img[^>]*alt=["']([^"']*)["'][^>]*src=["']([^"']+)["']/gi;
let match;
const addImage = (src: string, alt: string) => {
// Skip tiny tracking pixels, icons, and data URIs
if (src.includes('1x1') || src.includes('pixel') || src.includes('data:image/gif')) return;
if (src.endsWith('.svg') || src.endsWith('.ico')) return;
// Resolve relative URLs
let resolvedSrc = src;
if (baseUrl && !src.startsWith('http') && !src.startsWith('//')) {
resolvedSrc = src.startsWith('/') ? `${baseUrl}${src}` : `${baseUrl}/${src}`;
} else if (src.startsWith('//')) {
resolvedSrc = `https:${src}`;
}
// Avoid duplicates
if (!images.some(img => img.src === resolvedSrc)) {
images.push({ src: resolvedSrc, alt: alt || '' });
}
};
while ((match = regex.exec(html)) !== null) {
images.push({
src: match[1],
alt: match[2] || '',
});
addImage(match[1], match[2] || '');
}
while ((match = regex2.exec(html)) !== null) {
addImage(match[2], match[1] || '');
}
return images.slice(0, 20); // Limit to 20 images
// Also check og:image
const ogImageMatch = html.match(/<meta[^>]*property=["']og:image["'][^>]*content=["']([^"']+)["']/i);
if (ogImageMatch) {
addImage(ogImageMatch[1], 'og-image');
}
return images.slice(0, 20);
}
/**
 * Extract video links (YouTube, Vimeo, etc.) from HTML.
 *
 * Scans iframe embeds, anchor hrefs, and data-video-url / data-src
 * attributes; embed URLs are rewritten to their watch form.
 *
 * @param html Raw HTML document text.
 * @returns Up to 10 unique video URLs, in discovery order.
 */
private extractVideoLinks(html: string): string[] {
  const found = new Set<string>();

  // Prefix protocol-relative URLs with https.
  const withProtocol = (raw: string): string =>
    raw.startsWith('//') ? `https:${raw}` : raw;

  // YouTube / Vimeo iframe embeds
  const iframeRegex = /<iframe[^>]*src=["']([^"']*(?:youtube\.com|youtu\.be|vimeo\.com)[^"']*)["']/gi;
  for (const m of html.matchAll(iframeRegex)) {
    // Convert embed URL to watch URL
    const watchUrl = withProtocol(m[1])
      .replace('/embed/', '/watch?v=')
      .replace('?feature=oembed', '');
    found.add(watchUrl);
  }

  // YouTube links in anchors
  const anchorRegex = /<a[^>]*href=["']([^"']*(?:youtube\.com\/watch|youtu\.be\/)[^"']*)["']/gi;
  for (const m of html.matchAll(anchorRegex)) {
    found.add(m[1]);
  }

  // data-video-url or data-src attributes
  const dataRegex = /data-(?:video-url|src)=["']([^"']*(?:youtube\.com|youtu\.be|vimeo\.com)[^"']*)["']/gi;
  for (const m of html.matchAll(dataRegex)) {
    found.add(withProtocol(m[1]));
  }

  // Cap the result at 10 unique URLs.
  return Array.from(found).slice(0, 10);
}
/**