main
Some checks failed
Backend Deploy 🚀 / build-and-deploy (push) Has been cancelled

This commit is contained in:
Harun CAN
2026-03-23 14:14:52 +03:00
parent 9bd2b4a2dd
commit c1e081478c
7 changed files with 1371 additions and 126 deletions

701
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -52,6 +52,7 @@
"pino": "^10.1.0",
"pino-http": "^11.0.0",
"prisma": "^5.22.0",
"puppeteer": "^24.40.0",
"reflect-metadata": "^0.2.2",
"rxjs": "^7.8.1",
"zod": "^4.3.5"

View File

@@ -15,6 +15,7 @@ import { SeoModule } from '../seo/seo.module';
import { NeuroMarketingModule } from '../neuro-marketing/neuro-marketing.module';
import { GeminiModule } from '../gemini/gemini.module';
import { VisualGenerationModule } from '../visual-generation/visual-generation.module';
import { WebScraperService } from '../trends/services/web-scraper.service';
@Module({
@@ -28,6 +29,7 @@ import { VisualGenerationModule } from '../visual-generation/visual-generation.m
HashtagService,
BrandVoiceService,
VariationService,
WebScraperService,
],
controllers: [ContentGenerationController],
exports: [ContentGenerationService],

View File

@@ -14,11 +14,13 @@ import { SeoService, FullSeoAnalysis as SeoDTO } from '../seo/seo.service';
import { NeuroMarketingService } from '../neuro-marketing/neuro-marketing.service';
import { StorageService } from '../visual-generation/services/storage.service';
import { VisualGenerationService } from '../visual-generation/visual-generation.service';
import { WebScraperService, ScrapedContent } from '../trends/services/web-scraper.service';
import { ContentType as PrismaContentType, ContentStatus as PrismaContentStatus, MasterContentType as PrismaMasterContentType } from '@prisma/client';
export interface ContentGenerationRequest {
topic: string;
sourceUrl?: string;
niche?: string;
platforms: Platform[];
includeResearch?: boolean;
@@ -76,6 +78,7 @@ export class ContentGenerationService {
private readonly neuroService: NeuroMarketingService,
private readonly storageService: StorageService,
private readonly visualService: VisualGenerationService,
private readonly webScraperService: WebScraperService,
) { }
@@ -87,6 +90,7 @@ export class ContentGenerationService {
async generateContent(request: ContentGenerationRequest): Promise<GeneratedContentBundle> {
const {
topic,
sourceUrl,
niche,
platforms,
includeResearch = true,
@@ -99,6 +103,26 @@ export class ContentGenerationService {
console.log(`[ContentGenerationService] Starting generation for topic: ${topic}, platforms: ${platforms.join(', ')}`);
// ========== STEP 1: Scrape source article if URL provided ==========
let scrapedSource: ScrapedContent | null = null;
if (sourceUrl) {
this.logger.log(`Scraping source article: ${sourceUrl}`);
try {
scrapedSource = await this.webScraperService.scrapeUrl(sourceUrl, {
extractImages: true,
extractLinks: true,
timeout: 15000,
}, topic);
if (scrapedSource) {
this.logger.log(`Scraped source: ${scrapedSource.wordCount} words, ${scrapedSource.images.length} images, ${scrapedSource.videoLinks.length} videos`);
} else {
this.logger.warn(`Failed to scrape source URL: ${sourceUrl}`);
}
} catch (err) {
this.logger.warn(`Source scraping error: ${err.message}`);
}
}
// Analyze niche if provided
let nicheAnalysis: NicheAnalysis | undefined;
if (niche) {
@@ -116,6 +140,23 @@ export class ContentGenerationService {
});
}
// ========== Build enriched context from scraped source ==========
let sourceContext = '';
if (scrapedSource) {
const articleText = scrapedSource.content.substring(0, 3000);
const videoInfo = scrapedSource.videoLinks.length > 0
? `\nVİDEO LİNKLERİ: ${scrapedSource.videoLinks.join(', ')}`
: '';
const importantLinks = scrapedSource.links
.filter(l => l.isExternal && !l.href.includes('facebook') && !l.href.includes('twitter'))
.slice(0, 5)
.map(l => `${l.text}: ${l.href}`)
.join('\n');
const linkInfo = importantLinks ? `\nÖNEMLİ LİNKLER:\n${importantLinks}` : '';
sourceContext = `\n\n📰 KAYNAK MAKALE İÇERİĞİ (ZORUNLU REFERANS):\n${articleText}${videoInfo}${linkInfo}\n\n⚠ ÖNEMLİ: Yukarıdaki kaynak makaledeki TÜM özneleri (kişi, ürün, oyun adları, tarihler, fiyatlar, markalar) habere dahil et. Hiçbir önemli bilgiyi atlama. Video linkleri ve önemli dış linkler varsa bunları da içerikte paylaş.`;
}
// Generate content for each platform using AI
const platformContent: GeneratedContent[] = [];
for (const platform of platforms) {
@@ -127,11 +168,13 @@ export class ContentGenerationService {
const sanitizedSummary = this.sanitizeResearchSummary(
research?.summary || `Everything you need to know about ${topic}`
);
// Append scraped source context to give AI the full article details
const enrichedSummary = sanitizedSummary + sourceContext;
// Normalize platform to lowercase for consistency
const normalizedPlatform = platform.toLowerCase();
const aiContent = await this.platformService.generateAIContent(
topic,
sanitizedSummary,
enrichedSummary,
normalizedPlatform as any, // Cast to any/Platform to resolve type mismatch if Platform is strict union
'standard',
'tr',
@@ -145,6 +188,9 @@ export class ContentGenerationService {
this.logger.warn(`AI Content is empty for ${platform}`);
}
// Use scraped image from source if available
const sourceImageUrl = scrapedSource?.images?.[0]?.src || undefined;
const config = this.platformService.getPlatformConfig(platform);
let content: GeneratedContent = {
platform,
@@ -163,10 +209,19 @@ export class ContentGenerationService {
content.content = voiceApplied.branded;
}
// Add hashtags if requested
// Add hashtags using AI (based on actual generated content)
if (includeHashtags) {
const hashtagSet = this.hashtagService.generateHashtags(topic, platform);
content.hashtags = hashtagSet.hashtags.map((h) => h.hashtag);
try {
content.hashtags = await this.platformService.generateAIHashtags(
content.content,
topic,
platform as any,
'tr',
);
} catch (hashErr) {
this.logger.warn(`AI hashtag generation failed, skipping: ${hashErr.message}`);
content.hashtags = [];
}
}
// Generate image for visual platforms
@@ -180,11 +235,31 @@ export class ContentGenerationService {
platform: platformKey,
enhancePrompt: true,
});
content.imageUrl = image.url;
this.logger.log(`Image generated for ${platform}: ${image.url}`);
// Check if image is a real image or just a placeholder
const isPlaceholder = image.url?.includes('placehold.co') || image.url?.includes('placeholder');
if (!isPlaceholder) {
content.imageUrl = image.url;
this.logger.log(`Image generated for ${platform}: ${image.url}`);
} else if (sourceImageUrl) {
// Use scraped source image instead of placeholder
content.imageUrl = sourceImageUrl;
this.logger.log(`Using scraped source image instead of placeholder: ${sourceImageUrl}`);
} else {
content.imageUrl = image.url;
this.logger.log(`Image generated for ${platform}: ${image.url} (placeholder, no source image available)`);
}
} catch (imgError) {
this.logger.warn(`Image generation failed for ${platform}, continuing without image`, imgError);
// Fallback to scraped source image
if (sourceImageUrl) {
content.imageUrl = sourceImageUrl;
this.logger.log(`Using scraped source image as fallback: ${sourceImageUrl}`);
}
}
} else if (sourceImageUrl && !content.imageUrl) {
// For non-visual platforms, still attach source image if available
content.imageUrl = sourceImageUrl;
}
platformContent.push(content);
@@ -358,7 +433,7 @@ export class ContentGenerationService {
userId: effectiveUserId!,
masterContentId: masterContent.id,
type: contentType,
title: `${bundle.topic} - ${platformContent.platform}`,
title: this.sanitizeResearchSummary(`${bundle.topic}`) + ` - ${platformContent.platform}`,
body: platformContent.content,
hashtags: platformContent.hashtags,
status: PrismaContentStatus.DRAFT,
@@ -548,6 +623,8 @@ KURALLAR:
6. Karakter limitini koru
7. Platformun tonuna uygun yaz
8. SADECE yayınlanacak metni yaz
9. Hiçbir haber sitesi, kaynak, ajans veya web sitesi adı kullanma
10. "...göre", "...haberlere göre", "...kaynağına göre" gibi atıf ifadeleri ASLA kullanma
SADECE yeniden yazılmış metni döndür, açıklama ekleme.`;
@@ -589,25 +666,43 @@ SADECE yeniden yazılmış metni döndür, açıklama ekleme.`;
sanitized = sanitized.replace(/https?:\/\/[^\s]+/gi, '');
sanitized = sanitized.replace(/www\.[^\s]+/gi, '');
// Remove common Turkish attribution phrases
// Remove common attribution phrases (Turkish and English)
const attributionPatterns = [
/\b\w+\.com(\.tr)?\b/gi,
/\b\w+\.org(\.tr)?\b/gi,
/\b\w+\.net(\.tr)?\b/gi,
/\bkaynağına göre\b/gi,
/\b'e göre\b/gi,
/\b'(i|a|e|u|ü|\u0131)n(da|de) (yayınlanan|yer alan|çıkan)\b/gi,
/\b(da|de) (çıkan|yayınlanan|yer alan) (haberlere|habere|bilgilere) göre\b/gi,
/\bhaberlere göre\b/gi,
/\braporuna göre\b/gi,
/\bsitesinde yer alan\b/gi,
/\baçıklamasına göre\b/gi,
/\byazısına göre\b/gi,
/\bhaberine göre\b/gi,
/\btarafından yapılan\b/gi,
/\baccording to [^,.]+/gi,
/\breported by [^,.]+/gi,
/\bas reported in [^,.]+/gi,
/\bsource:\s*[^,.]+/gi,
/\breferans:\s*[^,.]+/gi,
/\bkaynak:\s*[^,.]+/gi,
];
// Common Turkish tech/news source brands to strip
// Comprehensive list of Turkish tech/news source brands to strip
const sourceNames = [
'donanımhaber', 'technopat', 'webtekno', 'shiftdelete',
'tamindir', 'donanımhaber', 'technopat', 'webtekno', 'shiftdelete',
'chip online', 'log.com', 'mediatrend', 'bbc', 'cnn',
'reuters', 'anadolu ajansı', 'hürriyet', 'milliyet',
'sabah', 'forbes', 'bloomberg', 'techcrunch',
'the verge', 'engadget', 'ars technica', 'wired',
'mashable', 'gizmodo', 'tom\'s hardware', 'tom\'s guide',
'ntv', 'habertürk', 'sozcu', 'sözcü', 'cumhuriyet', 'star',
'posta', 'aksam', 'yeni safak', 'yeni şafak', 'takvim',
'mynet', 'ensonhaber', 'haber7', 'internethaber',
'ad hoc news', 'finanzen.net', 'der aktionär', 'aktionar',
'business insider', 'cnbc', 'financial times', 'wall street journal',
];
for (const pattern of attributionPatterns) {
@@ -615,12 +710,15 @@ SADECE yeniden yazılmış metni döndür, açıklama ekleme.`;
}
for (const source of sourceNames) {
const regex = new RegExp(`\\b${source}\\b`, 'gi');
const regex = new RegExp(`\\b${source.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`, 'gi');
sanitized = sanitized.replace(regex, '');
}
// Clean up multiple spaces and trailing commas
sanitized = sanitized.replace(/\s{2,}/g, ' ').replace(/,\s*,/g, ',').trim();
// Also remove "- site_name" patterns from titles (e.g. "Great News - Tamindir")
sanitized = sanitized.replace(/\s*-\s*$/gm, '');
// Clean up multiple spaces, trailing commas, and orphaned punctuation
sanitized = sanitized.replace(/\s{2,}/g, ' ').replace(/,\s*,/g, ',').replace(/\s+([.,;:!?])/g, '$1').trim();
return sanitized;
}

View File

@@ -502,21 +502,39 @@ TON: ${config.tone}${styleInstruction}${ctaInstruction}
Bu platform için özgün, ilgi çekici ve viral potansiyeli yüksek bir içerik oluştur.
KURALLAR:
📈 SEO OPTİMİZASYONU (ZORUNLU):
- Bu konuyu Google'da, YouTube'da veya sosyal medyada arayan biri hangi kelimeleri kullanır? O kelimeleri belirle ve içeriğe yerleştir.
- İLK 2 CÜMLEDE arama hacmi en yüksek anahtar kelimeleri MUTLAKA kullan.
- Hook/giriş cümlesi birincil anahtar kelimeyi içersin.
- Anahtar kelimeleri doğal bir akış içinde kullan, zoraki tekrar yapma.
- Konu ile ilgili en çok aranan terimleri, teknik terimleri ve marka/ürün adlarını (haberin konusu olan markaları — kaynak değil) ön plana çıkar.
KRİTİK KURALLAR:
1. Karakter limitine uy
2. Platformun tonuna uygun yaz
3. Hook (dikkat çeken giriş) ile başla
4. CTA ile bitir (yukarıdaki CTA talimatına göre)
5. Emoji kullan ama aşırıya kaçma
6. ${language === 'tr' ? 'Türkçe' : 'İngilizce'} yaz
7. ASLA resim URL'i, medya linki veya [görsel] gibi yer tutucular ekleme
8. Görsel betimlemeleri metnin içine YAZMA
9. İçerik %100 özgün olmalı - asla kaynak kopyası yapma
10. Kaynak linklerini, URL'leri veya atıfları ASLA ekleme
11. Mevcut içeriklerden alıntı yapma, tamamen yeni ve orijinal yaz
12. Bilgiyi kendi cümlelerinle ifade et, paraphrase bile yapma
13. Araştırma kaynaklarının isimlerini (web siteleri, haber siteleri, markalar, gazeteler) ASLA metinde kullanma veya referans verme
14. "...göre", "...kaynağına göre", "according to" gibi atıf ifadeleri ASLA kullanma
7. İçerik %100 özgün olmalı - asla kaynak kopyası yapma
8. Bilgiyi kendi cümlelerinle ifade et, paraphrase bile yapma
⚠️ YAYIN HAZIR İÇERİK (ÇOK ÖNEMLİ):
- İçerik doğrudan kopyala-yapıştır ile yayınlanabilir olmalı
- "[Buraya Link]", "[Link Ekle]", "[URL]", "[Görsel]", "[Video]" gibi YER TUTUCU İFADELER ASLA kullanma
- Resim URL'i, medya linki veya placeholder ASLA ekleme
- Görsel betimlemeleri metnin içine YAZMA
- "Linke tıklayın", "Bio'daki linke gidin" gibi CTA'lar kullanabilirsin ama asla köşeli parantez içinde placeholder koyma
- Eğer bir link veya URL bilmiyorsan, o kısmı tamamen atla — placeholder bırakma
- İçerikte doldurulması gereken boşluk OLMAMALI
⛔ KAYNAK YASAĞI (EN ÖNEMLİ KURAL):
- Hiçbir haber sitesi, web sitesi, gazete, ajans, blog veya medya kuruluşu adını ASLA yazma
- "Tamindir", "Webtekno", "DonanımHaber", "ShiftDelete", "TechCrunch", "BBC", "CNN", "Reuters", "Forbes", "Bloomberg" gibi site/kaynak adlarını ASLA kullanma
- "...haberlere göre", "...raporuna göre", "...kaynağına göre", "...sitesinde yer alan", "...çıkan haberlere göre", "according to", "...tarafından yapılan" gibi ATıF İFADELERİ ASLA kullanma
- Haberin nereden alındığını BELİRTME, doğrudan bilgiyi kendi cümlelerinle anlat
- İçerikte kaynak gösterme, referans verme veya atıf yapma YOK
- Bilgi/veri paylaşırken kaynağı belirtmeden doğrudan bilgiyi ver
SADECE yayınlanacak metni yaz, açıklama veya başlık ekleme.`;
@@ -532,6 +550,107 @@ SADECE yayınlanacak metni yaz, açıklama veya başlık ekleme.`;
}
}
/**
 * Generate relevant, SEO-optimized hashtags using AI.
 * Replaces the old mock-based hashtag generation.
 *
 * @param content  Generated post body; only the first 500 chars are sent to the model
 * @param topic    Content topic, used for model context and for the non-AI fallback
 * @param platform Target platform; its config caps how many hashtags are returned
 * @param language 'tr' permits mixed Turkish/English tags, anything else requests English
 * @returns '#'-prefixed hashtags; empty when the platform allows none
 */
async generateAIHashtags(
  content: string,
  topic: string,
  platform: Platform,
  language: string = 'tr',
): Promise<string[]> {
  const config = this.platforms[platform];
  if (!config || config.maxHashtags === 0) return [];

  if (!this.gemini.isAvailable()) {
    this.logger.warn('Gemini not available for hashtag generation, using fallback');
    return this.generateFallbackHashtags(topic, config.maxHashtags);
  }

  // Instagram technically allows more, but cap at 15 to avoid spammy tag walls.
  const maxCount = platform === 'instagram'
    ? Math.min(config.maxHashtags, 15)
    : config.maxHashtags;

  const prompt = `Sen bir sosyal medya SEO uzmanısın. Aşağıdaki içerik ve konu için ${platform.toUpperCase()} platformunda kullanılacak EN UYGUN ${maxCount} hashtag üret.
KONU: ${topic}
İÇERİK:
${content.substring(0, 500)}
HASHTAG KURALLARI:
1. Her hashtag DOĞRUDAN içerikle ilgili olmalı — genel veya ilişkisiz hashtag OLMASIN
2. Arama hacmi yüksek, gerçek kullanıcıların arayacağı kelimelerden oluştur
3. Konunun ana terimleri, teknik terimleri, marka/ürün adları (haberin konusu olanlar) ve sektör terimleri olsun
4. "tips", "howto", "life", "community", "motivation", "goals" gibi genel son ekler KULLANMA
5. ${language === 'tr' ? 'Türkçe ve İngilizce karışık olabilir, hangisi daha çok aranıyorsa onu seç' : 'Use English hashtags'}
6. Hashtag'ı # ile başlat
7. Tek kelime veya kısa bileşik kelimeler kullan (boşluk yok)
8. Haber kaynağı olan sitelerin adlarını (Webtekno, Tamindir, DonanımHaber, ShiftDelete, TechCrunch, BBC, CNN vb.) ASLA hashtag olarak kullanma — bunlar bizim kaynağımız, içeriğimiz değil
SADECE hashtag listesini döndür, her satırda bir hashtag. Başka açıklama ekleme.
ÖRNEK FORMAT:
#hashtag1
#hashtag2
#hashtag3`;

  try {
    const response = await this.gemini.generateText(prompt, {
      temperature: 0.4,
      maxTokens: 200,
    });

    // Source/brand names that must never appear as hashtags (we aggregate FROM
    // them; they are not the subject). Entries are lowercase and space-free to
    // match the normalized tags below. (Previous list had a duplicate
    // 'donanımhaber' and the typo 'adhoçnews'.)
    const bannedSources = new Set([
      'tamindir', 'webtekno', 'donanımhaber', 'shiftdelete',
      'technopat', 'chipsonline', 'chiponline', 'mediatrend', 'hürriyet',
      'milliyet', 'sabah', 'ntv', 'habertürk', 'sözcü', 'sozcu',
      'cumhuriyet', 'posta', 'aksam', 'takvim', 'mynet', 'ensonhaber',
      'haber7', 'internethaber', 'bbc', 'cnn', 'reuters', 'forbes',
      'bloomberg', 'techcrunch', 'theverge', 'engadget', 'wired',
      'gizmodo', 'mashable', 'businessinsider', 'cnbc', 'adhocnews',
      'finanzennet', 'deraktionär', 'aktionar',
    ]);

    const hashtags = response.text
      .split('\n')
      .map(line => line.trim())
      .filter(line => line.startsWith('#') && line.length > 1)
      .map(tag => tag.replace(/\s+/g, ''))
      // Drop any tag that is actually a news-source name.
      .filter(tag => !bannedSources.has(tag.replace('#', '').toLowerCase()))
      .slice(0, maxCount);

    if (hashtags.length === 0) {
      this.logger.warn('AI returned no valid hashtags, using fallback');
      return this.generateFallbackHashtags(topic, maxCount);
    }

    this.logger.log(`AI generated ${hashtags.length} hashtags for ${platform}: ${hashtags.join(', ')}`);
    return hashtags;
  } catch (error) {
    this.logger.error(`AI hashtag generation failed: ${error.message}`);
    return this.generateFallbackHashtags(topic, maxCount);
  }
}
/**
 * Fallback hashtag generation when AI is unavailable.
 * Builds tags from the meaningful words of the topic itself instead of
 * appending generic suffixes like "tips" or "life".
 *
 * @param topic    Content topic to mine for hashtag words
 * @param maxCount Upper bound on the number of tags returned
 * @returns Up to maxCount '#'-prefixed lowercase words from the topic
 */
private generateFallbackHashtags(topic: string, maxCount: number): string[] {
  // Turkish + English filler words that make useless hashtags.
  const ignored = new Set(['ve', 'ile', 'bir', 'bu', 'için', 'da', 'de', 'the', 'a', 'an', 'and', 'or', 'for', 'in', 'on', 'is', 'are', 'was', 'of', 'to']);

  // Lowercase and strip punctuation; the class keeps Turkish letters intact.
  const normalized = topic.toLowerCase().replace(/[^a-zçğıöşü\w\s]/gi, '');

  const tags: string[] = [];
  for (const word of normalized.split(/\s+/)) {
    if (tags.length >= maxCount) break;
    // Short tokens (<= 3 chars) are almost never searchable hashtags.
    if (word.length > 3 && !ignored.has(word)) {
      tags.push(`#${word}`);
    }
  }
  return tags;
}
private generateTemplateContent(
topic: string,
mainMessage: string,

View File

@@ -270,7 +270,7 @@ export class ContentOptimizationService {
if (!keyword) return 50;
const words = content.split(/\s+/).length;
const kwCount = (content.toLowerCase().match(new RegExp(keyword.toLowerCase(), 'g')) || []).length;
const kwCount = content.toLowerCase().split(keyword.toLowerCase()).length - 1;
const density = (kwCount / words) * 100;
if (density >= this.optimalParams.keywordDensity.min &&

View File

@@ -2,6 +2,7 @@
// Path: src/modules/trends/services/web-scraper.service.ts
import { Injectable, Logger } from '@nestjs/common';
import * as puppeteer from 'puppeteer';
export interface ScrapedContent {
url: string;
@@ -12,6 +13,7 @@ export interface ScrapedContent {
headings: { level: number; text: string }[];
links: { text: string; href: string; isExternal: boolean }[];
images: { src: string; alt: string }[];
videoLinks: string[];
metadata: {
author?: string;
publishDate?: string;
@@ -63,37 +65,362 @@ export interface ScraperOptions {
export class WebScraperService {
private readonly logger = new Logger(WebScraperService.name);
private readonly contentCache = new Map<string, ScrapedContent>();
private readonly defaultUserAgent = 'ContentHunter/1.0 (Research Bot)';
private readonly defaultUserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
/**
* Scrape content from a web page
* Scrape content from a web page.
* Automatically resolves Google News redirect URLs.
* @param url The URL to scrape
* @param options Scraper options
* @param articleTitle The article title (used for Google News URL resolution via search)
*/
async scrapeUrl(url: string, options?: ScraperOptions): Promise<ScrapedContent | null> {
async scrapeUrl(url: string, options?: ScraperOptions, articleTitle?: string): Promise<ScrapedContent | null> {
// Validate URL
if (!this.isValidUrl(url)) {
this.logger.warn(`Invalid URL: ${url}`);
return null;
}
// Resolve Google News redirect URLs to actual article URLs
let resolvedUrl = url;
if (url.includes('news.google.com') || url.includes('google.com/rss')) {
this.logger.log(`Detected Google News URL: ${url}`);
// Strategy 1: Use Puppeteer headless browser to follow JS redirects (most reliable)
const puppeteerResult = await this.resolveGoogleNewsWithPuppeteer(url);
if (puppeteerResult) {
resolvedUrl = puppeteerResult;
this.logger.log(`Puppeteer resolved Google News URL to: ${resolvedUrl}`);
} else {
// Strategy 2: Fall back to DuckDuckGo title search
this.logger.warn('Puppeteer resolution failed, trying DuckDuckGo title search...');
if (articleTitle) {
const searchResult = await this.findArticleByTitle(articleTitle);
if (searchResult) {
resolvedUrl = searchResult;
this.logger.log(`DuckDuckGo found article URL: ${resolvedUrl}`);
} else {
this.logger.warn('Both Puppeteer and DuckDuckGo failed. Cannot resolve Google News URL.');
return null;
}
} else {
this.logger.warn('No article title provided for fallback search. Cannot resolve Google News URL.');
return null;
}
}
}
// Check cache
const cached = this.contentCache.get(url);
const cached = this.contentCache.get(resolvedUrl);
if (cached && this.isCacheValid(cached)) {
return cached;
}
try {
const response = await this.fetchPage(url, options);
if (!response) return null;
this.logger.log(`Scraping URL: ${resolvedUrl}`);
const response = await this.fetchPage(resolvedUrl, options);
if (!response) {
this.logger.warn(`fetchPage returned null for ${resolvedUrl}`);
return null;
}
const content = this.parseHtml(response.html, url, options);
this.logger.log(`Fetched HTML: ${response.html.length} chars`);
const content = this.parseHtml(response.html, resolvedUrl, options);
content.html = options?.includeHtml ? response.html : '';
// Cache the result
this.contentCache.set(url, content);
this.contentCache.set(resolvedUrl, content);
this.logger.log(`Scraped successfully: ${content.title}, ${content.images.length} images, ${content.videoLinks.length} videos, ${content.wordCount} words`);
return content;
} catch (error) {
this.logger.error(`Failed to scrape ${url}:`, error);
this.logger.error(`Failed to scrape ${resolvedUrl}:`, error);
return null;
}
}
/**
 * Resolve a Google News redirect URL to the actual article URL using Puppeteer.
 * Google News uses JavaScript-only redirects that cannot be followed via HTTP,
 * so a headless browser is launched to execute the redirect script.
 *
 * @param googleNewsUrl A news.google.com article/redirect URL
 * @returns The final non-Google article URL, or null when resolution fails
 */
private async resolveGoogleNewsWithPuppeteer(googleNewsUrl: string): Promise<string | null> {
  let browser: puppeteer.Browser | null = null;
  try {
    this.logger.log(`Launching Puppeteer to resolve Google News URL...`);
    browser = await puppeteer.launch({
      headless: true,
      // NOTE(review): --no-sandbox/--disable-setuid-sandbox are presumably for a
      // containerized deploy where Chromium runs as root — confirm against the
      // deployment environment before removing.
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-extensions',
        '--disable-background-networking',
        '--window-size=1280,720',
      ],
      timeout: 20000,
    });
    const page = await browser.newPage();
    // Set a realistic user agent (headless defaults get bot-blocked)
    await page.setUserAgent(this.defaultUserAgent);
    // Block unnecessary resources to speed up loading
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      const resourceType = request.resourceType();
      if (['image', 'stylesheet', 'font', 'media'].includes(resourceType)) {
        request.abort();
      } else {
        request.continue();
      }
    });
    // Navigate to the Google News URL
    await page.goto(googleNewsUrl, {
      waitUntil: 'networkidle2',
      timeout: 15000,
    });
    // Wait for the redirect to complete — check periodically if URL changed
    let finalUrl = page.url();
    const startTime = Date.now();
    const maxWait = 10000; // 10 seconds max
    while (finalUrl.includes('news.google.com') && (Date.now() - startTime) < maxWait) {
      await new Promise(resolve => setTimeout(resolve, 500));
      finalUrl = page.url();
    }
    // Close eagerly and null out so the finally block doesn't double-close.
    await browser.close();
    browser = null;
    // Check if we successfully left Google News
    // (consent.google.com means a cookie-consent interstitial blocked us).
    if (!finalUrl.includes('news.google.com') && !finalUrl.includes('consent.google.com')) {
      this.logger.log(`Puppeteer resolved to: ${finalUrl}`);
      return finalUrl;
    } else {
      this.logger.warn(`Puppeteer could not resolve - still on Google domain: ${finalUrl}`);
      return null;
    }
  } catch (error) {
    this.logger.warn(`Puppeteer resolution failed: ${error.message}`);
    return null;
  } finally {
    // Guarantee the Chromium process is reaped even when launch/goto throws.
    if (browser) {
      try { await browser.close(); } catch (e) { /* ignore */ }
    }
  }
}
/**
 * Find the actual article URL by searching for the article title.
 * Google News RSS titles usually end in " - SourceName"; that source name is
 * split off and prepended to the query to bias results toward the source site.
 * Uses DuckDuckGo HTML search (more bot-friendly than Google).
 *
 * NOTE(review): this does NOT use DuckDuckGo's `site:` operator — it only
 * prepends "<source>.com " as a plain search term, and it assumes a .com
 * domain (many Turkish outlets are .com.tr) — confirm this is intended.
 *
 * @param title Article title, optionally suffixed with " - SourceName"
 * @returns The best-matching result URL, or null when search fails/finds nothing
 */
private async findArticleByTitle(title: string): Promise<string | null> {
  try {
    // Extract source name from title (usually at the end after " - ")
    const parts = title.split(/\s+-\s+/);
    const sourceName = parts.length > 1 ? parts[parts.length - 1].trim() : '';
    const cleanTitle = parts.length > 1 ? parts.slice(0, -1).join(' - ').trim() : title;
    // Remove brackets and special chars from title for better search
    // (the Unicode ranges keep Latin-extended letters, i.e. Turkish diacritics).
    const searchableTitle = cleanTitle
      .replace(/\[.*?\]/g, '')
      .replace(/[^\w\s\u00C0-\u024F\u0100-\u017F\u011E-\u011F\u0130-\u0131\u015E-\u015F\u00D6\u00F6\u00DC\u00FC\u00C7\u00E7]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();
    // Build search query — prepend the source's presumed domain when known
    const sourceDomain = sourceName.toLowerCase().replace(/\s+/g, '');
    const siteFilter = sourceName ? `${sourceDomain}.com ` : '';
    const searchQuery = `${siteFilter}${searchableTitle}`;
    this.logger.log(`Searching DuckDuckGo for article: ${searchQuery}`);
    // Use DuckDuckGo HTML search (more bot-friendly)
    const searchUrl = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(searchQuery)}`;
    const response = await fetch(searchUrl, {
      headers: {
        'User-Agent': this.defaultUserAgent,
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
      },
    });
    if (!response.ok) {
      this.logger.warn(`DuckDuckGo search returned ${response.status}`);
      return null;
    }
    const html = await response.text();
    // DuckDuckGo HTML results wrap target URLs in redirect links whose
    // uddg= query parameter holds the (URL-encoded) destination URL.
    const ddgPattern = /uddg=(https?[^&"]+)/g;
    const foundUrls: string[] = [];
    const seen = new Set<string>();
    let match;
    while ((match = ddgPattern.exec(html)) !== null) {
      const foundUrl = decodeURIComponent(match[1]);
      // Skip duplicates, DDG's own links, and anything pointing back at Google.
      if (!seen.has(foundUrl) && !foundUrl.includes('duckduckgo.com') && !foundUrl.includes('google.com')) {
        seen.add(foundUrl);
        foundUrls.push(foundUrl);
      }
    }
    if (foundUrls.length > 0) {
      // Prefer URLs matching the source name
      if (sourceName) {
        const sourceUrl = foundUrls.find(u => u.toLowerCase().includes(sourceDomain));
        if (sourceUrl) {
          this.logger.log(`Found matching source URL via DuckDuckGo: ${sourceUrl}`);
          return sourceUrl;
        }
      }
      this.logger.log(`Using first DuckDuckGo result: ${foundUrls[0]}`);
      return foundUrls[0];
    }
    this.logger.warn('No search results found for article title');
    return null;
  } catch (error) {
    this.logger.warn(`Article search failed: ${error.message}`);
    return null;
  }
}
/**
 * Resolve redirect URLs (especially Google News) to the final destination URL.
 * Tries, in order: decoding the base64 payload embedded in Google News RSS
 * URLs; following HTTP redirects (manual HEAD for the Location header, then a
 * full GET); and finally mining the fetched HTML for a redirect target
 * (data-* attribute, meta refresh, canonical link, or any external href).
 *
 * @param url The (possibly redirecting) URL to resolve
 * @returns The resolved URL — possibly still a Google URL when every HTML
 *          strategy fails — or null when the network requests themselves error.
 */
private async resolveRedirectUrl(url: string): Promise<string | null> {
  // Strategy 1: Decode Google News base64-encoded URL from the path
  try {
    const decoded = this.decodeGoogleNewsUrl(url);
    if (decoded) {
      this.logger.log(`Decoded Google News URL: ${decoded}`);
      return decoded;
    }
  } catch (e) {
    this.logger.warn(`Base64 decode failed: ${e.message}`);
  }
  // Strategy 2: Follow HTTP redirects
  try {
    // First try with redirect: 'manual' to read the Location header ourselves
    const headResponse = await fetch(url, {
      method: 'HEAD',
      headers: {
        'User-Agent': this.defaultUserAgent,
        'Accept': 'text/html',
      },
      redirect: 'manual',
    });
    const locationHeader = headResponse.headers.get('location');
    if (locationHeader && !locationHeader.includes('news.google.com')) {
      this.logger.log(`Redirect Location header: ${locationHeader}`);
      return locationHeader;
    }
    // Try full GET with redirect follow
    const getResponse = await fetch(url, {
      method: 'GET',
      headers: {
        'User-Agent': this.defaultUserAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      redirect: 'follow',
    });
    // fetch exposes the post-redirect URL; accept it if it left Google News.
    if (getResponse.url && !getResponse.url.includes('news.google.com')) {
      return getResponse.url;
    }
    // Strategy 3: Parse HTML for article link / meta refresh / canonical
    const html = await getResponse.text();
    // Check data-redirect/data-href/data-url attributes for a non-Google target
    const dataRedirect = html.match(/data-(?:redirect|href|url)=["'](https?:\/\/(?!news\.google\.com)[^"']+)["']/i);
    if (dataRedirect) return dataRedirect[1];
    // Meta refresh
    const metaRefresh = html.match(/<meta[^>]*http-equiv=["']refresh["'][^>]*content=["']\d+;\s*url=([^"']+)["']/i);
    if (metaRefresh) return metaRefresh[1];
    // Canonical
    const canonical = html.match(/<link[^>]*rel=["']canonical["'][^>]*href=["']([^"']+)["']/i);
    if (canonical && !canonical[1].includes('news.google.com')) return canonical[1];
    // Any external link that looks like an article
    const externalLink = html.match(/href=["'](https?:\/\/(?!(?:news|www)\.google\.com)[^"']+)["']/i);
    if (externalLink) return externalLink[1];
    // Give up: hand back whatever URL the GET ended on (may still be Google).
    this.logger.warn(`Could not resolve Google News URL, returning original`);
    return getResponse.url;
  } catch (error) {
    this.logger.warn(`Failed to resolve redirect for ${url}: ${error.message}`);
    return null;
  }
}
/**
 * Decode the real article URL embedded in a Google News RSS link.
 * Google News encodes the target inside a base64url segment of the path,
 * e.g. https://news.google.com/rss/articles/CBMi{payload}.
 *
 * @param googleUrl A news.google.com URL
 * @returns The first non-Google URL found in the decoded payload, the first
 *          URL otherwise, or null when nothing decodable is present.
 */
private decodeGoogleNewsUrl(googleUrl: string): string | null {
  try {
    // Locate the path segment immediately after "/articles/".
    const segments = new URL(googleUrl).pathname.split('/');
    const idx = segments.indexOf('articles');
    if (idx === -1 || idx + 1 >= segments.length) {
      return null;
    }

    // Drop any query string that ended up glued to the segment.
    const token = segments[idx + 1].split('?')[0];

    // Convert base64url ('-'/'_') to standard base64 and restore '=' padding.
    let b64 = token.replace(/-/g, '+').replace(/_/g, '/');
    while (b64.length % 4 !== 0) {
      b64 += '=';
    }

    const payload = Buffer.from(b64, 'base64').toString('utf-8');

    // Pull every URL-looking run out of the (partially binary) payload.
    const candidates = payload.match(/https?:\/\/[^\s"'<>\x00-\x1F]+/g);
    if (!candidates || candidates.length === 0) {
      return null;
    }

    // Prefer a URL that is not Google's own.
    const external = candidates.find(u => !u.includes('google.com'));
    if (external) {
      // Trim trailing binary garbage the regex may have swallowed.
      const cleanUrl = external.replace(/[\x00-\x1F\x7F-\x9F]+.*$/, '');
      this.logger.log(`Decoded article URL from base64: ${cleanUrl}`);
      return cleanUrl;
    }
    return candidates[0];
  } catch (error) {
    this.logger.warn(`Failed to decode Google News URL: ${error.message}`);
    return null;
  }
}
@@ -129,64 +456,57 @@ export class WebScraperService {
}
/**
* Fetch page content (simulated)
* Fetch page content using real HTTP fetch
*/
private async fetchPage(url: string, options?: ScraperOptions): Promise<{ html: string } | null> {
// In production, use:
// 1. node-fetch or axios for simple pages
// 2. Puppeteer/Playwright for JavaScript-rendered pages
// 3. Cheerio for HTML parsing
const timeout = options?.timeout || 15000;
const userAgent = options?.userAgent || this.defaultUserAgent;
// Simulated HTML for demonstration
const mockHtml = `
<!DOCTYPE html>
<html>
<head>
<title>Sample Article: Content Creation Strategies</title>
<meta name="description" content="Learn the best content creation strategies for 2024">
<meta name="author" content="John Doe">
<meta name="keywords" content="content, creation, marketing, strategy">
<meta property="og:title" content="Content Creation Strategies">
<meta property="og:description" content="Master content creation with these proven strategies">
<meta property="og:image" content="https://example.com/image.jpg">
</head>
<body>
<article>
<h1>10 Content Creation Strategies for 2024</h1>
<p class="author">By John Doe | Published: January 15, 2024</p>
try {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), timeout);
<h2>Introduction</h2>
<p>Content creation has evolved significantly over the past year. In this comprehensive guide, we'll explore the most effective strategies for creating engaging content.</p>
const response = await fetch(url, {
headers: {
'User-Agent': userAgent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
'Accept-Encoding': 'identity',
},
signal: controller.signal,
redirect: 'follow',
});
<h2>1. Focus on Value First</h2>
<p>The most successful content creators prioritize providing value to their audience. According to a recent study, 78% of consumers prefer brands that create custom content.</p>
clearTimeout(timeoutId);
<h2>2. Embrace Short-Form Video</h2>
<p>Short-form video continues to dominate. TikTok and Instagram Reels have shown that 15-60 second videos can generate massive engagement.</p>
if (!response.ok) {
this.logger.warn(`HTTP ${response.status} for ${url}`);
return null;
}
<blockquote>"Content is king, but distribution is queen." - Gary Vaynerchuk</blockquote>
const contentType = response.headers.get('content-type') || '';
if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
this.logger.warn(`Non-HTML content type: ${contentType} for ${url}`);
return null;
}
<h2>3. Use AI Wisely</h2>
<p>AI tools like ChatGPT and Claude can help with ideation and drafting, but human creativity remains essential for authentic content.</p>
const html = await response.text();
<h3>Key Statistics</h3>
<ul>
<li>85% of marketers use content marketing</li>
<li>Video content generates 1200% more shares</li>
<li>Long-form content gets 77% more backlinks</li>
</ul>
if (!html || html.length < 100) {
this.logger.warn(`Empty or very short response from ${url}`);
return null;
}
<h2>Conclusion</h2>
<p>Success in content creation requires a balance of strategy, creativity, and consistency. Start implementing these strategies today!</p>
<a href="/related-article">Read more articles</a>
<a href="https://external.com/resource">External resource</a>
</article>
</body>
</html>
`;
return { html: mockHtml };
this.logger.log(`Successfully fetched ${url} (${html.length} chars)`);
return { html };
} catch (error) {
if (error.name === 'AbortError') {
this.logger.warn(`Request timed out for ${url}`);
} else {
this.logger.error(`Failed to fetch ${url}: ${error.message}`);
}
return null;
}
}
/**
 * Parse HTML content into structured ScrapedContent
 */
private parseHtml(html: string, url: string, options?: ScraperOptions): ScrapedContent {
const domain = new URL(url).hostname;
const baseUrl = new URL(url).origin;
// Extract title
const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
@@ -212,8 +533,11 @@ export class WebScraperService {
// Extract links
const links = options?.extractLinks !== false ? this.extractLinks(html, domain) : [];
// Extract images
const images = options?.extractImages !== false ? this.extractImages(html) : [];
// Extract images with absolute URL resolution
const images = options?.extractImages !== false ? this.extractImages(html, baseUrl) : [];
// Extract video links (YouTube, etc.)
const videoLinks = this.extractVideoLinks(html);
// Extract metadata
const metadata = this.extractMetadata(html);
@@ -232,6 +556,7 @@ export class WebScraperService {
headings,
links,
images,
videoLinks,
metadata,
wordCount,
readingTime,
@@ -306,21 +631,82 @@ export class WebScraperService {
}
/**
* Extract images from HTML
* Extract images from HTML with absolute URL resolution
*/
private extractImages(html: string): { src: string; alt: string }[] {
private extractImages(html: string, baseUrl?: string): { src: string; alt: string }[] {
const images: { src: string; alt: string }[] = [];
// Match src before alt, or alt before src
const regex = /<img[^>]*src=["']([^"']+)["'][^>]*(?:alt=["']([^"']*)["'])?/gi;
const regex2 = /<img[^>]*alt=["']([^"']*)["'][^>]*src=["']([^"']+)["']/gi;
let match;
const addImage = (src: string, alt: string) => {
// Skip tiny tracking pixels, icons, and data URIs
if (src.includes('1x1') || src.includes('pixel') || src.includes('data:image/gif')) return;
if (src.endsWith('.svg') || src.endsWith('.ico')) return;
// Resolve relative URLs
let resolvedSrc = src;
if (baseUrl && !src.startsWith('http') && !src.startsWith('//')) {
resolvedSrc = src.startsWith('/') ? `${baseUrl}${src}` : `${baseUrl}/${src}`;
} else if (src.startsWith('//')) {
resolvedSrc = `https:${src}`;
}
// Avoid duplicates
if (!images.some(img => img.src === resolvedSrc)) {
images.push({ src: resolvedSrc, alt: alt || '' });
}
};
while ((match = regex.exec(html)) !== null) {
images.push({
src: match[1],
alt: match[2] || '',
});
addImage(match[1], match[2] || '');
}
while ((match = regex2.exec(html)) !== null) {
addImage(match[2], match[1] || '');
}
return images.slice(0, 20); // Limit to 20 images
// Also check og:image
const ogImageMatch = html.match(/<meta[^>]*property=["']og:image["'][^>]*content=["']([^"']+)["']/i);
if (ogImageMatch) {
addImage(ogImageMatch[1], 'og-image');
}
return images.slice(0, 20);
}
/**
 * Extract video links (YouTube, Vimeo, etc.) from HTML.
 *
 * Scans iframe embeds, anchor hrefs, and data-video-url / data-src
 * attributes; embed URLs are rewritten to their watch form.
 *
 * @param html Raw HTML document text.
 * @returns Up to 10 unique video URLs, in discovery order.
 */
private extractVideoLinks(html: string): string[] {
  const found = new Set<string>();

  // Prefix protocol-relative URLs with https.
  const withProtocol = (raw: string): string =>
    raw.startsWith('//') ? `https:${raw}` : raw;

  // YouTube / Vimeo iframe embeds
  const iframeRegex = /<iframe[^>]*src=["']([^"']*(?:youtube\.com|youtu\.be|vimeo\.com)[^"']*)["']/gi;
  for (const m of html.matchAll(iframeRegex)) {
    // Convert embed URL to watch URL
    const watchUrl = withProtocol(m[1])
      .replace('/embed/', '/watch?v=')
      .replace('?feature=oembed', '');
    found.add(watchUrl);
  }

  // YouTube links in anchors
  const anchorRegex = /<a[^>]*href=["']([^"']*(?:youtube\.com\/watch|youtu\.be\/)[^"']*)["']/gi;
  for (const m of html.matchAll(anchorRegex)) {
    found.add(m[1]);
  }

  // data-video-url or data-src attributes
  const dataRegex = /data-(?:video-url|src)=["']([^"']*(?:youtube\.com|youtu\.be|vimeo\.com)[^"']*)["']/gi;
  for (const m of html.matchAll(dataRegex)) {
    found.add(withProtocol(m[1]));
  }

  // Cap the result at 10 unique URLs.
  return Array.from(found).slice(0, 10);
}
/**