generated from fahricansecer/boilerplate-be
This commit is contained in:
701
package-lock.json
generated
701
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -52,6 +52,7 @@
|
||||
"pino": "^10.1.0",
|
||||
"pino-http": "^11.0.0",
|
||||
"prisma": "^5.22.0",
|
||||
"puppeteer": "^24.40.0",
|
||||
"reflect-metadata": "^0.2.2",
|
||||
"rxjs": "^7.8.1",
|
||||
"zod": "^4.3.5"
|
||||
|
||||
@@ -15,6 +15,7 @@ import { SeoModule } from '../seo/seo.module';
|
||||
import { NeuroMarketingModule } from '../neuro-marketing/neuro-marketing.module';
|
||||
import { GeminiModule } from '../gemini/gemini.module';
|
||||
import { VisualGenerationModule } from '../visual-generation/visual-generation.module';
|
||||
import { WebScraperService } from '../trends/services/web-scraper.service';
|
||||
|
||||
|
||||
@Module({
|
||||
@@ -28,6 +29,7 @@ import { VisualGenerationModule } from '../visual-generation/visual-generation.m
|
||||
HashtagService,
|
||||
BrandVoiceService,
|
||||
VariationService,
|
||||
WebScraperService,
|
||||
],
|
||||
controllers: [ContentGenerationController],
|
||||
exports: [ContentGenerationService],
|
||||
|
||||
@@ -14,11 +14,13 @@ import { SeoService, FullSeoAnalysis as SeoDTO } from '../seo/seo.service';
|
||||
import { NeuroMarketingService } from '../neuro-marketing/neuro-marketing.service';
|
||||
import { StorageService } from '../visual-generation/services/storage.service';
|
||||
import { VisualGenerationService } from '../visual-generation/visual-generation.service';
|
||||
import { WebScraperService, ScrapedContent } from '../trends/services/web-scraper.service';
|
||||
import { ContentType as PrismaContentType, ContentStatus as PrismaContentStatus, MasterContentType as PrismaMasterContentType } from '@prisma/client';
|
||||
|
||||
|
||||
export interface ContentGenerationRequest {
|
||||
topic: string;
|
||||
sourceUrl?: string;
|
||||
niche?: string;
|
||||
platforms: Platform[];
|
||||
includeResearch?: boolean;
|
||||
@@ -76,6 +78,7 @@ export class ContentGenerationService {
|
||||
private readonly neuroService: NeuroMarketingService,
|
||||
private readonly storageService: StorageService,
|
||||
private readonly visualService: VisualGenerationService,
|
||||
private readonly webScraperService: WebScraperService,
|
||||
) { }
|
||||
|
||||
|
||||
@@ -87,6 +90,7 @@ export class ContentGenerationService {
|
||||
async generateContent(request: ContentGenerationRequest): Promise<GeneratedContentBundle> {
|
||||
const {
|
||||
topic,
|
||||
sourceUrl,
|
||||
niche,
|
||||
platforms,
|
||||
includeResearch = true,
|
||||
@@ -99,6 +103,26 @@ export class ContentGenerationService {
|
||||
|
||||
console.log(`[ContentGenerationService] Starting generation for topic: ${topic}, platforms: ${platforms.join(', ')}`);
|
||||
|
||||
// ========== STEP 1: Scrape source article if URL provided ==========
|
||||
let scrapedSource: ScrapedContent | null = null;
|
||||
if (sourceUrl) {
|
||||
this.logger.log(`Scraping source article: ${sourceUrl}`);
|
||||
try {
|
||||
scrapedSource = await this.webScraperService.scrapeUrl(sourceUrl, {
|
||||
extractImages: true,
|
||||
extractLinks: true,
|
||||
timeout: 15000,
|
||||
}, topic);
|
||||
if (scrapedSource) {
|
||||
this.logger.log(`Scraped source: ${scrapedSource.wordCount} words, ${scrapedSource.images.length} images, ${scrapedSource.videoLinks.length} videos`);
|
||||
} else {
|
||||
this.logger.warn(`Failed to scrape source URL: ${sourceUrl}`);
|
||||
}
|
||||
} catch (err) {
|
||||
this.logger.warn(`Source scraping error: ${err.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Analyze niche if provided
|
||||
let nicheAnalysis: NicheAnalysis | undefined;
|
||||
if (niche) {
|
||||
@@ -116,6 +140,23 @@ export class ContentGenerationService {
|
||||
});
|
||||
}
|
||||
|
||||
// ========== Build enriched context from scraped source ==========
|
||||
let sourceContext = '';
|
||||
if (scrapedSource) {
|
||||
const articleText = scrapedSource.content.substring(0, 3000);
|
||||
const videoInfo = scrapedSource.videoLinks.length > 0
|
||||
? `\nVİDEO LİNKLERİ: ${scrapedSource.videoLinks.join(', ')}`
|
||||
: '';
|
||||
const importantLinks = scrapedSource.links
|
||||
.filter(l => l.isExternal && !l.href.includes('facebook') && !l.href.includes('twitter'))
|
||||
.slice(0, 5)
|
||||
.map(l => `${l.text}: ${l.href}`)
|
||||
.join('\n');
|
||||
const linkInfo = importantLinks ? `\nÖNEMLİ LİNKLER:\n${importantLinks}` : '';
|
||||
|
||||
sourceContext = `\n\n📰 KAYNAK MAKALE İÇERİĞİ (ZORUNLU REFERANS):\n${articleText}${videoInfo}${linkInfo}\n\n⚠️ ÖNEMLİ: Yukarıdaki kaynak makaledeki TÜM özneleri (kişi, ürün, oyun adları, tarihler, fiyatlar, markalar) habere dahil et. Hiçbir önemli bilgiyi atlama. Video linkleri ve önemli dış linkler varsa bunları da içerikte paylaş.`;
|
||||
}
|
||||
|
||||
// Generate content for each platform using AI
|
||||
const platformContent: GeneratedContent[] = [];
|
||||
for (const platform of platforms) {
|
||||
@@ -127,11 +168,13 @@ export class ContentGenerationService {
|
||||
const sanitizedSummary = this.sanitizeResearchSummary(
|
||||
research?.summary || `Everything you need to know about ${topic}`
|
||||
);
|
||||
// Append scraped source context to give AI the full article details
|
||||
const enrichedSummary = sanitizedSummary + sourceContext;
|
||||
// Normalize platform to lowercase for consistency
|
||||
const normalizedPlatform = platform.toLowerCase();
|
||||
const aiContent = await this.platformService.generateAIContent(
|
||||
topic,
|
||||
sanitizedSummary,
|
||||
enrichedSummary,
|
||||
normalizedPlatform as any, // Cast to any/Platform to resolve type mismatch if Platform is strict union
|
||||
'standard',
|
||||
'tr',
|
||||
@@ -145,6 +188,9 @@ export class ContentGenerationService {
|
||||
this.logger.warn(`AI Content is empty for ${platform}`);
|
||||
}
|
||||
|
||||
// Use scraped image from source if available
|
||||
const sourceImageUrl = scrapedSource?.images?.[0]?.src || undefined;
|
||||
|
||||
const config = this.platformService.getPlatformConfig(platform);
|
||||
let content: GeneratedContent = {
|
||||
platform,
|
||||
@@ -163,10 +209,19 @@ export class ContentGenerationService {
|
||||
content.content = voiceApplied.branded;
|
||||
}
|
||||
|
||||
// Add hashtags if requested
|
||||
// Add hashtags using AI (based on actual generated content)
|
||||
if (includeHashtags) {
|
||||
const hashtagSet = this.hashtagService.generateHashtags(topic, platform);
|
||||
content.hashtags = hashtagSet.hashtags.map((h) => h.hashtag);
|
||||
try {
|
||||
content.hashtags = await this.platformService.generateAIHashtags(
|
||||
content.content,
|
||||
topic,
|
||||
platform as any,
|
||||
'tr',
|
||||
);
|
||||
} catch (hashErr) {
|
||||
this.logger.warn(`AI hashtag generation failed, skipping: ${hashErr.message}`);
|
||||
content.hashtags = [];
|
||||
}
|
||||
}
|
||||
|
||||
// Generate image for visual platforms
|
||||
@@ -180,12 +235,32 @@ export class ContentGenerationService {
|
||||
platform: platformKey,
|
||||
enhancePrompt: true,
|
||||
});
|
||||
|
||||
// Check if image is a real image or just a placeholder
|
||||
const isPlaceholder = image.url?.includes('placehold.co') || image.url?.includes('placeholder');
|
||||
if (!isPlaceholder) {
|
||||
content.imageUrl = image.url;
|
||||
this.logger.log(`Image generated for ${platform}: ${image.url}`);
|
||||
} else if (sourceImageUrl) {
|
||||
// Use scraped source image instead of placeholder
|
||||
content.imageUrl = sourceImageUrl;
|
||||
this.logger.log(`Using scraped source image instead of placeholder: ${sourceImageUrl}`);
|
||||
} else {
|
||||
content.imageUrl = image.url;
|
||||
this.logger.log(`Image generated for ${platform}: ${image.url} (placeholder, no source image available)`);
|
||||
}
|
||||
} catch (imgError) {
|
||||
this.logger.warn(`Image generation failed for ${platform}, continuing without image`, imgError);
|
||||
// Fallback to scraped source image
|
||||
if (sourceImageUrl) {
|
||||
content.imageUrl = sourceImageUrl;
|
||||
this.logger.log(`Using scraped source image as fallback: ${sourceImageUrl}`);
|
||||
}
|
||||
}
|
||||
} else if (sourceImageUrl && !content.imageUrl) {
|
||||
// For non-visual platforms, still attach source image if available
|
||||
content.imageUrl = sourceImageUrl;
|
||||
}
|
||||
|
||||
platformContent.push(content);
|
||||
} catch (error) {
|
||||
@@ -358,7 +433,7 @@ export class ContentGenerationService {
|
||||
userId: effectiveUserId!,
|
||||
masterContentId: masterContent.id,
|
||||
type: contentType,
|
||||
title: `${bundle.topic} - ${platformContent.platform}`,
|
||||
title: this.sanitizeResearchSummary(`${bundle.topic}`) + ` - ${platformContent.platform}`,
|
||||
body: platformContent.content,
|
||||
hashtags: platformContent.hashtags,
|
||||
status: PrismaContentStatus.DRAFT,
|
||||
@@ -548,6 +623,8 @@ KURALLAR:
|
||||
6. Karakter limitini koru
|
||||
7. Platformun tonuna uygun yaz
|
||||
8. SADECE yayınlanacak metni yaz
|
||||
9. Hiçbir haber sitesi, kaynak, ajans veya web sitesi adı kullanma
|
||||
10. "...göre", "...haberlere göre", "...kaynağına göre" gibi atıf ifadeleri ASLA kullanma
|
||||
|
||||
SADECE yeniden yazılmış metni döndür, açıklama ekleme.`;
|
||||
|
||||
@@ -589,25 +666,43 @@ SADECE yeniden yazılmış metni döndür, açıklama ekleme.`;
|
||||
sanitized = sanitized.replace(/https?:\/\/[^\s]+/gi, '');
|
||||
sanitized = sanitized.replace(/www\.[^\s]+/gi, '');
|
||||
|
||||
// Remove common Turkish attribution phrases
|
||||
// Remove common attribution phrases (Turkish and English)
|
||||
const attributionPatterns = [
|
||||
/\b\w+\.com(\.tr)?\b/gi,
|
||||
/\b\w+\.org(\.tr)?\b/gi,
|
||||
/\b\w+\.net(\.tr)?\b/gi,
|
||||
/\bkaynağına göre\b/gi,
|
||||
/\b'e göre\b/gi,
|
||||
/\b'(i|a|e|u|ü|\u0131)n(da|de) (yayınlanan|yer alan|çıkan)\b/gi,
|
||||
/\b(da|de) (çıkan|yayınlanan|yer alan) (haberlere|habere|bilgilere) göre\b/gi,
|
||||
/\bhaberlere göre\b/gi,
|
||||
/\braporuna göre\b/gi,
|
||||
/\bsitesinde yer alan\b/gi,
|
||||
/\baçıklamasına göre\b/gi,
|
||||
/\byazısına göre\b/gi,
|
||||
/\bhaberine göre\b/gi,
|
||||
/\btarafından yapılan\b/gi,
|
||||
/\baccording to [^,.]+/gi,
|
||||
/\breported by [^,.]+/gi,
|
||||
/\bas reported in [^,.]+/gi,
|
||||
/\bsource:\s*[^,.]+/gi,
|
||||
/\breferans:\s*[^,.]+/gi,
|
||||
/\bkaynak:\s*[^,.]+/gi,
|
||||
];
|
||||
|
||||
// Common Turkish tech/news source brands to strip
|
||||
// Comprehensive list of Turkish tech/news source brands to strip
|
||||
const sourceNames = [
|
||||
'donanımhaber', 'technopat', 'webtekno', 'shiftdelete',
|
||||
'tamindir', 'donanımhaber', 'technopat', 'webtekno', 'shiftdelete',
|
||||
'chip online', 'log.com', 'mediatrend', 'bbc', 'cnn',
|
||||
'reuters', 'anadolu ajansı', 'hürriyet', 'milliyet',
|
||||
'sabah', 'forbes', 'bloomberg', 'techcrunch',
|
||||
'the verge', 'engadget', 'ars technica', 'wired',
|
||||
'mashable', 'gizmodo', 'tom\'s hardware', 'tom\'s guide',
|
||||
'ntv', 'habertürk', 'sozcu', 'sözcü', 'cumhuriyet', 'star',
|
||||
'posta', 'aksam', 'yeni safak', 'yeni şafak', 'takvim',
|
||||
'mynet', 'ensonhaber', 'haber7', 'internethaber',
|
||||
'ad hoc news', 'finanzen.net', 'der aktionär', 'aktionar',
|
||||
'business insider', 'cnbc', 'financial times', 'wall street journal',
|
||||
];
|
||||
|
||||
for (const pattern of attributionPatterns) {
|
||||
@@ -615,12 +710,15 @@ SADECE yeniden yazılmış metni döndür, açıklama ekleme.`;
|
||||
}
|
||||
|
||||
for (const source of sourceNames) {
|
||||
const regex = new RegExp(`\\b${source}\\b`, 'gi');
|
||||
const regex = new RegExp(`\\b${source.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`, 'gi');
|
||||
sanitized = sanitized.replace(regex, '');
|
||||
}
|
||||
|
||||
// Clean up multiple spaces and trailing commas
|
||||
sanitized = sanitized.replace(/\s{2,}/g, ' ').replace(/,\s*,/g, ',').trim();
|
||||
// Also remove "- site_name" patterns from titles (e.g. "Great News - Tamindir")
|
||||
sanitized = sanitized.replace(/\s*-\s*$/gm, '');
|
||||
|
||||
// Clean up multiple spaces, trailing commas, and orphaned punctuation
|
||||
sanitized = sanitized.replace(/\s{2,}/g, ' ').replace(/,\s*,/g, ',').replace(/\s+([.,;:!?])/g, '$1').trim();
|
||||
|
||||
return sanitized;
|
||||
}
|
||||
|
||||
@@ -502,21 +502,39 @@ TON: ${config.tone}${styleInstruction}${ctaInstruction}
|
||||
|
||||
Bu platform için özgün, ilgi çekici ve viral potansiyeli yüksek bir içerik oluştur.
|
||||
|
||||
KURALLAR:
|
||||
📈 SEO OPTİMİZASYONU (ZORUNLU):
|
||||
- Bu konuyu Google'da, YouTube'da veya sosyal medyada arayan biri hangi kelimeleri kullanır? O kelimeleri belirle ve içeriğe yerleştir.
|
||||
- İLK 2 CÜMLEDE arama hacmi en yüksek anahtar kelimeleri MUTLAKA kullan.
|
||||
- Hook/giriş cümlesi birincil anahtar kelimeyi içersin.
|
||||
- Anahtar kelimeleri doğal bir akış içinde kullan, zoraki tekrar yapma.
|
||||
- Konu ile ilgili en çok aranan terimleri, teknik terimleri ve marka/ürün adlarını (haberin konusu olan markaları — kaynak değil) ön plana çıkar.
|
||||
|
||||
KRİTİK KURALLAR:
|
||||
1. Karakter limitine uy
|
||||
2. Platformun tonuna uygun yaz
|
||||
3. Hook (dikkat çeken giriş) ile başla
|
||||
4. CTA ile bitir (yukarıdaki CTA talimatına göre)
|
||||
5. Emoji kullan ama aşırıya kaçma
|
||||
6. ${language === 'tr' ? 'Türkçe' : 'İngilizce'} yaz
|
||||
7. ASLA resim URL'i, medya linki veya [görsel] gibi yer tutucular ekleme
|
||||
8. Görsel betimlemeleri metnin içine YAZMA
|
||||
9. İçerik %100 özgün olmalı - asla kaynak kopyası yapma
|
||||
10. Kaynak linklerini, URL'leri veya atıfları ASLA ekleme
|
||||
11. Mevcut içeriklerden alıntı yapma, tamamen yeni ve orijinal yaz
|
||||
12. Bilgiyi kendi cümlelerinle ifade et, paraphrase bile yapma
|
||||
13. Araştırma kaynaklarının isimlerini (web siteleri, haber siteleri, markalar, gazeteler) ASLA metinde kullanma veya referans verme
|
||||
14. "...göre", "...kaynağına göre", "according to" gibi atıf ifadeleri ASLA kullanma
|
||||
7. İçerik %100 özgün olmalı - asla kaynak kopyası yapma
|
||||
8. Bilgiyi kendi cümlelerinle ifade et, paraphrase bile yapma
|
||||
|
||||
⚠️ YAYIN HAZIR İÇERİK (ÇOK ÖNEMLİ):
|
||||
- İçerik doğrudan kopyala-yapıştır ile yayınlanabilir olmalı
|
||||
- "[Buraya Link]", "[Link Ekle]", "[URL]", "[Görsel]", "[Video]" gibi YER TUTUCU İFADELER ASLA kullanma
|
||||
- Resim URL'i, medya linki veya placeholder ASLA ekleme
|
||||
- Görsel betimlemeleri metnin içine YAZMA
|
||||
- "Linke tıklayın", "Bio'daki linke gidin" gibi CTA'lar kullanabilirsin ama asla köşeli parantez içinde placeholder koyma
|
||||
- Eğer bir link veya URL bilmiyorsan, o kısmı tamamen atla — placeholder bırakma
|
||||
- İçerikte doldurulması gereken boşluk OLMAMALI
|
||||
|
||||
⛔ KAYNAK YASAĞI (EN ÖNEMLİ KURAL):
|
||||
- Hiçbir haber sitesi, web sitesi, gazete, ajans, blog veya medya kuruluşu adını ASLA yazma
|
||||
- "Tamindir", "Webtekno", "DonanımHaber", "ShiftDelete", "TechCrunch", "BBC", "CNN", "Reuters", "Forbes", "Bloomberg" gibi site/kaynak adlarını ASLA kullanma
|
||||
- "...haberlere göre", "...raporuna göre", "...kaynağına göre", "...sitesinde yer alan", "...çıkan haberlere göre", "according to", "...tarafından yapılan" gibi ATıF İFADELERİ ASLA kullanma
|
||||
- Haberin nereden alındığını BELİRTME, doğrudan bilgiyi kendi cümlelerinle anlat
|
||||
- İçerikte kaynak gösterme, referans verme veya atıf yapma YOK
|
||||
- Bilgi/veri paylaşırken kaynağı belirtmeden doğrudan bilgiyi ver
|
||||
|
||||
SADECE yayınlanacak metni yaz, açıklama veya başlık ekleme.`;
|
||||
|
||||
@@ -532,6 +550,107 @@ SADECE yayınlanacak metni yaz, açıklama veya başlık ekleme.`;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate relevant, SEO-optimized hashtags using AI
|
||||
* Replaces the old mock-based hashtag generation
|
||||
*/
|
||||
async generateAIHashtags(
|
||||
content: string,
|
||||
topic: string,
|
||||
platform: Platform,
|
||||
language: string = 'tr',
|
||||
): Promise<string[]> {
|
||||
const config = this.platforms[platform];
|
||||
if (!config || config.maxHashtags === 0) return [];
|
||||
|
||||
if (!this.gemini.isAvailable()) {
|
||||
this.logger.warn('Gemini not available for hashtag generation, using fallback');
|
||||
return this.generateFallbackHashtags(topic, config.maxHashtags);
|
||||
}
|
||||
|
||||
const maxCount = Math.min(config.maxHashtags, platform === 'instagram' ? 15 : config.maxHashtags);
|
||||
|
||||
const prompt = `Sen bir sosyal medya SEO uzmanısın. Aşağıdaki içerik ve konu için ${platform.toUpperCase()} platformunda kullanılacak EN UYGUN ${maxCount} hashtag üret.
|
||||
|
||||
KONU: ${topic}
|
||||
İÇERİK:
|
||||
${content.substring(0, 500)}
|
||||
|
||||
HASHTAG KURALLARI:
|
||||
1. Her hashtag DOĞRUDAN içerikle ilgili olmalı — genel veya ilişkisiz hashtag OLMASIN
|
||||
2. Arama hacmi yüksek, gerçek kullanıcıların arayacağı kelimelerden oluştur
|
||||
3. Konunun ana terimleri, teknik terimleri, marka/ürün adları (haberin konusu olanlar) ve sektör terimleri olsun
|
||||
4. "tips", "howto", "life", "community", "motivation", "goals" gibi genel son ekler KULLANMA
|
||||
5. ${language === 'tr' ? 'Türkçe ve İngilizce karışık olabilir, hangisi daha çok aranıyorsa onu seç' : 'Use English hashtags'}
|
||||
6. Hashtag'ı # ile başlat
|
||||
7. Tek kelime veya kısa bileşik kelimeler kullan (boşluk yok)
|
||||
8. Haber kaynağı olan sitelerin adlarını (Webtekno, Tamindir, DonanımHaber, ShiftDelete, TechCrunch, BBC, CNN vb.) ASLA hashtag olarak kullanma — bunlar bizim kaynağımız, içeriğimiz değil
|
||||
|
||||
SADECE hashtag listesini döndür, her satırda bir hashtag. Başka açıklama ekleme.
|
||||
ÖRNEK FORMAT:
|
||||
#hashtag1
|
||||
#hashtag2
|
||||
#hashtag3`;
|
||||
|
||||
try {
|
||||
const response = await this.gemini.generateText(prompt, {
|
||||
temperature: 0.4,
|
||||
maxTokens: 200,
|
||||
});
|
||||
|
||||
// Banned source names that should never appear as hashtags
|
||||
const bannedSources = [
|
||||
'tamindir', 'webtekno', 'donanımhaber', 'donanımhaber', 'shiftdelete',
|
||||
'technopat', 'chipsonline', 'chiponline', 'mediatrend', 'hürriyet',
|
||||
'milliyet', 'sabah', 'ntv', 'habertürk', 'sözcü', 'sozcu',
|
||||
'cumhuriyet', 'posta', 'aksam', 'takvim', 'mynet', 'ensonhaber',
|
||||
'haber7', 'internethaber', 'bbc', 'cnn', 'reuters', 'forbes',
|
||||
'bloomberg', 'techcrunch', 'theverge', 'engadget', 'wired',
|
||||
'gizmodo', 'mashable', 'businessinsider', 'cnbc', 'adhoçnews',
|
||||
'finanzennet', 'deraktionär', 'aktionar',
|
||||
];
|
||||
|
||||
const hashtags = response.text
|
||||
.split('\n')
|
||||
.map(line => line.trim())
|
||||
.filter(line => line.startsWith('#') && line.length > 1)
|
||||
.map(tag => tag.replace(/\s+/g, ''))
|
||||
.filter(tag => {
|
||||
const cleanTag = tag.replace('#', '').toLowerCase();
|
||||
return !bannedSources.some(source =>
|
||||
cleanTag === source || cleanTag === source.replace(/\s/g, '')
|
||||
);
|
||||
})
|
||||
.slice(0, maxCount);
|
||||
|
||||
if (hashtags.length === 0) {
|
||||
this.logger.warn('AI returned no valid hashtags, using fallback');
|
||||
return this.generateFallbackHashtags(topic, maxCount);
|
||||
}
|
||||
|
||||
this.logger.log(`AI generated ${hashtags.length} hashtags for ${platform}: ${hashtags.join(', ')}`);
|
||||
return hashtags;
|
||||
} catch (error) {
|
||||
this.logger.error(`AI hashtag generation failed: ${error.message}`);
|
||||
return this.generateFallbackHashtags(topic, maxCount);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fallback hashtag generation when AI is unavailable
|
||||
* Extracts meaningful words from the topic instead of appending generic suffixes
|
||||
*/
|
||||
private generateFallbackHashtags(topic: string, maxCount: number): string[] {
|
||||
const stopWords = new Set(['ve', 'ile', 'bir', 'bu', 'için', 'da', 'de', 'the', 'a', 'an', 'and', 'or', 'for', 'in', 'on', 'is', 'are', 'was', 'of', 'to']);
|
||||
return topic
|
||||
.toLowerCase()
|
||||
.replace(/[^a-zçğıöşü\w\s]/gi, '')
|
||||
.split(/\s+/)
|
||||
.filter(w => w.length > 3 && !stopWords.has(w))
|
||||
.slice(0, maxCount)
|
||||
.map(w => `#${w}`);
|
||||
}
|
||||
|
||||
private generateTemplateContent(
|
||||
topic: string,
|
||||
mainMessage: string,
|
||||
|
||||
@@ -270,7 +270,7 @@ export class ContentOptimizationService {
|
||||
if (!keyword) return 50;
|
||||
|
||||
const words = content.split(/\s+/).length;
|
||||
const kwCount = (content.toLowerCase().match(new RegExp(keyword.toLowerCase(), 'g')) || []).length;
|
||||
const kwCount = content.toLowerCase().split(keyword.toLowerCase()).length - 1;
|
||||
const density = (kwCount / words) * 100;
|
||||
|
||||
if (density >= this.optimalParams.keywordDensity.min &&
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
// Path: src/modules/trends/services/web-scraper.service.ts
|
||||
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
|
||||
export interface ScrapedContent {
|
||||
url: string;
|
||||
@@ -12,6 +13,7 @@ export interface ScrapedContent {
|
||||
headings: { level: number; text: string }[];
|
||||
links: { text: string; href: string; isExternal: boolean }[];
|
||||
images: { src: string; alt: string }[];
|
||||
videoLinks: string[];
|
||||
metadata: {
|
||||
author?: string;
|
||||
publishDate?: string;
|
||||
@@ -63,37 +65,362 @@ export interface ScraperOptions {
|
||||
export class WebScraperService {
|
||||
private readonly logger = new Logger(WebScraperService.name);
|
||||
private readonly contentCache = new Map<string, ScrapedContent>();
|
||||
private readonly defaultUserAgent = 'ContentHunter/1.0 (Research Bot)';
|
||||
private readonly defaultUserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
||||
|
||||
/**
|
||||
* Scrape content from a web page
|
||||
* Scrape content from a web page.
|
||||
* Automatically resolves Google News redirect URLs.
|
||||
* @param url The URL to scrape
|
||||
* @param options Scraper options
|
||||
* @param articleTitle The article title (used for Google News URL resolution via search)
|
||||
*/
|
||||
async scrapeUrl(url: string, options?: ScraperOptions): Promise<ScrapedContent | null> {
|
||||
async scrapeUrl(url: string, options?: ScraperOptions, articleTitle?: string): Promise<ScrapedContent | null> {
|
||||
// Validate URL
|
||||
if (!this.isValidUrl(url)) {
|
||||
this.logger.warn(`Invalid URL: ${url}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Resolve Google News redirect URLs to actual article URLs
|
||||
let resolvedUrl = url;
|
||||
if (url.includes('news.google.com') || url.includes('google.com/rss')) {
|
||||
this.logger.log(`Detected Google News URL: ${url}`);
|
||||
|
||||
// Strategy 1: Use Puppeteer headless browser to follow JS redirects (most reliable)
|
||||
const puppeteerResult = await this.resolveGoogleNewsWithPuppeteer(url);
|
||||
if (puppeteerResult) {
|
||||
resolvedUrl = puppeteerResult;
|
||||
this.logger.log(`Puppeteer resolved Google News URL to: ${resolvedUrl}`);
|
||||
} else {
|
||||
// Strategy 2: Fall back to DuckDuckGo title search
|
||||
this.logger.warn('Puppeteer resolution failed, trying DuckDuckGo title search...');
|
||||
if (articleTitle) {
|
||||
const searchResult = await this.findArticleByTitle(articleTitle);
|
||||
if (searchResult) {
|
||||
resolvedUrl = searchResult;
|
||||
this.logger.log(`DuckDuckGo found article URL: ${resolvedUrl}`);
|
||||
} else {
|
||||
this.logger.warn('Both Puppeteer and DuckDuckGo failed. Cannot resolve Google News URL.');
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
this.logger.warn('No article title provided for fallback search. Cannot resolve Google News URL.');
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check cache
|
||||
const cached = this.contentCache.get(url);
|
||||
const cached = this.contentCache.get(resolvedUrl);
|
||||
if (cached && this.isCacheValid(cached)) {
|
||||
return cached;
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await this.fetchPage(url, options);
|
||||
if (!response) return null;
|
||||
this.logger.log(`Scraping URL: ${resolvedUrl}`);
|
||||
const response = await this.fetchPage(resolvedUrl, options);
|
||||
if (!response) {
|
||||
this.logger.warn(`fetchPage returned null for ${resolvedUrl}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
const content = this.parseHtml(response.html, url, options);
|
||||
this.logger.log(`Fetched HTML: ${response.html.length} chars`);
|
||||
const content = this.parseHtml(response.html, resolvedUrl, options);
|
||||
content.html = options?.includeHtml ? response.html : '';
|
||||
|
||||
// Cache the result
|
||||
this.contentCache.set(url, content);
|
||||
this.contentCache.set(resolvedUrl, content);
|
||||
|
||||
this.logger.log(`Scraped successfully: ${content.title}, ${content.images.length} images, ${content.videoLinks.length} videos, ${content.wordCount} words`);
|
||||
return content;
|
||||
} catch (error) {
|
||||
this.logger.error(`Failed to scrape ${url}:`, error);
|
||||
this.logger.error(`Failed to scrape ${resolvedUrl}:`, error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolve a Google News redirect URL to the actual article URL using Puppeteer.
|
||||
* Google News uses JavaScript-only redirects that cannot be followed via HTTP.
|
||||
* Puppeteer launches a headless browser to follow the redirect.
|
||||
*/
|
||||
private async resolveGoogleNewsWithPuppeteer(googleNewsUrl: string): Promise<string | null> {
|
||||
let browser: puppeteer.Browser | null = null;
|
||||
try {
|
||||
this.logger.log(`Launching Puppeteer to resolve Google News URL...`);
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
headless: true,
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-gpu',
|
||||
'--disable-extensions',
|
||||
'--disable-background-networking',
|
||||
'--window-size=1280,720',
|
||||
],
|
||||
timeout: 20000,
|
||||
});
|
||||
|
||||
const page = await browser.newPage();
|
||||
|
||||
// Set a realistic user agent
|
||||
await page.setUserAgent(this.defaultUserAgent);
|
||||
|
||||
// Block unnecessary resources to speed up loading
|
||||
await page.setRequestInterception(true);
|
||||
page.on('request', (request) => {
|
||||
const resourceType = request.resourceType();
|
||||
if (['image', 'stylesheet', 'font', 'media'].includes(resourceType)) {
|
||||
request.abort();
|
||||
} else {
|
||||
request.continue();
|
||||
}
|
||||
});
|
||||
|
||||
// Navigate to the Google News URL
|
||||
await page.goto(googleNewsUrl, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 15000,
|
||||
});
|
||||
|
||||
// Wait for the redirect to complete — check periodically if URL changed
|
||||
let finalUrl = page.url();
|
||||
const startTime = Date.now();
|
||||
const maxWait = 10000; // 10 seconds max
|
||||
|
||||
while (finalUrl.includes('news.google.com') && (Date.now() - startTime) < maxWait) {
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
finalUrl = page.url();
|
||||
}
|
||||
|
||||
await browser.close();
|
||||
browser = null;
|
||||
|
||||
// Check if we successfully left Google News
|
||||
if (!finalUrl.includes('news.google.com') && !finalUrl.includes('consent.google.com')) {
|
||||
this.logger.log(`Puppeteer resolved to: ${finalUrl}`);
|
||||
return finalUrl;
|
||||
} else {
|
||||
this.logger.warn(`Puppeteer could not resolve - still on Google domain: ${finalUrl}`);
|
||||
return null;
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.warn(`Puppeteer resolution failed: ${error.message}`);
|
||||
return null;
|
||||
} finally {
|
||||
if (browser) {
|
||||
try { await browser.close(); } catch (e) { /* ignore */ }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Find actual article URL by searching for the article title.
|
||||
* Extracts the source name from the title (e.g., "Title - Webtekno" → searches "site:webtekno.com Title")
|
||||
* Uses DuckDuckGo HTML search (more bot-friendly than Google)
|
||||
*/
|
||||
private async findArticleByTitle(title: string): Promise<string | null> {
|
||||
try {
|
||||
// Extract source name from title (usually at the end after " - ")
|
||||
const parts = title.split(/\s+-\s+/);
|
||||
const sourceName = parts.length > 1 ? parts[parts.length - 1].trim() : '';
|
||||
const cleanTitle = parts.length > 1 ? parts.slice(0, -1).join(' - ').trim() : title;
|
||||
|
||||
// Remove brackets and special chars from title for better search
|
||||
const searchableTitle = cleanTitle
|
||||
.replace(/\[.*?\]/g, '')
|
||||
.replace(/[^\w\s\u00C0-\u024F\u0100-\u017F\u011E-\u011F\u0130-\u0131\u015E-\u015F\u00D6\u00F6\u00DC\u00FC\u00C7\u00E7]/g, ' ')
|
||||
.replace(/\s+/g, ' ')
|
||||
.trim();
|
||||
|
||||
// Build search query — prefer site: filter if we know the source
|
||||
const sourceDomain = sourceName.toLowerCase().replace(/\s+/g, '');
|
||||
const siteFilter = sourceName ? `${sourceDomain}.com ` : '';
|
||||
const searchQuery = `${siteFilter}${searchableTitle}`;
|
||||
|
||||
this.logger.log(`Searching DuckDuckGo for article: ${searchQuery}`);
|
||||
|
||||
// Use DuckDuckGo HTML search (more bot-friendly)
|
||||
const searchUrl = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(searchQuery)}`;
|
||||
const response = await fetch(searchUrl, {
|
||||
headers: {
|
||||
'User-Agent': this.defaultUserAgent,
|
||||
'Accept': 'text/html,application/xhtml+xml',
|
||||
'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
this.logger.warn(`DuckDuckGo search returned ${response.status}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
const html = await response.text();
|
||||
|
||||
// DuckDuckGo HTML results contain article URLs in uddg= parameters
|
||||
const ddgPattern = /uddg=(https?[^&"]+)/g;
|
||||
const foundUrls: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
let match;
|
||||
while ((match = ddgPattern.exec(html)) !== null) {
|
||||
const foundUrl = decodeURIComponent(match[1]);
|
||||
if (!seen.has(foundUrl) && !foundUrl.includes('duckduckgo.com') && !foundUrl.includes('google.com')) {
|
||||
seen.add(foundUrl);
|
||||
foundUrls.push(foundUrl);
|
||||
}
|
||||
}
|
||||
|
||||
if (foundUrls.length > 0) {
|
||||
// Prefer URLs matching the source name
|
||||
if (sourceName) {
|
||||
const sourceUrl = foundUrls.find(u => u.toLowerCase().includes(sourceDomain));
|
||||
if (sourceUrl) {
|
||||
this.logger.log(`Found matching source URL via DuckDuckGo: ${sourceUrl}`);
|
||||
return sourceUrl;
|
||||
}
|
||||
}
|
||||
this.logger.log(`Using first DuckDuckGo result: ${foundUrls[0]}`);
|
||||
return foundUrls[0];
|
||||
}
|
||||
|
||||
this.logger.warn('No search results found for article title');
|
||||
return null;
|
||||
} catch (error) {
|
||||
this.logger.warn(`Article search failed: ${error.message}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolve redirect URLs (especially Google News) to the final destination URL.
|
||||
* Google News RSS URLs encode the actual article URL in a base64 segment in the path.
|
||||
*/
|
||||
private async resolveRedirectUrl(url: string): Promise<string | null> {
|
||||
// Strategy 1: Decode Google News base64-encoded URL from the path
|
||||
try {
|
||||
const decoded = this.decodeGoogleNewsUrl(url);
|
||||
if (decoded) {
|
||||
this.logger.log(`Decoded Google News URL: ${decoded}`);
|
||||
return decoded;
|
||||
}
|
||||
} catch (e) {
|
||||
this.logger.warn(`Base64 decode failed: ${e.message}`);
|
||||
}
|
||||
|
||||
// Strategy 2: Follow HTTP redirects
|
||||
try {
|
||||
// First try with redirect: 'manual' to get Location header
|
||||
const headResponse = await fetch(url, {
|
||||
method: 'HEAD',
|
||||
headers: {
|
||||
'User-Agent': this.defaultUserAgent,
|
||||
'Accept': 'text/html',
|
||||
},
|
||||
redirect: 'manual',
|
||||
});
|
||||
|
||||
const locationHeader = headResponse.headers.get('location');
|
||||
if (locationHeader && !locationHeader.includes('news.google.com')) {
|
||||
this.logger.log(`Redirect Location header: ${locationHeader}`);
|
||||
return locationHeader;
|
||||
}
|
||||
|
||||
// Try full GET with redirect follow
|
||||
const getResponse = await fetch(url, {
|
||||
method: 'GET',
|
||||
headers: {
|
||||
'User-Agent': this.defaultUserAgent,
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
},
|
||||
redirect: 'follow',
|
||||
});
|
||||
|
||||
if (getResponse.url && !getResponse.url.includes('news.google.com')) {
|
||||
return getResponse.url;
|
||||
}
|
||||
|
||||
// Strategy 3: Parse HTML for article link / meta refresh / canonical
|
||||
const html = await getResponse.text();
|
||||
|
||||
// Check data-redirect attribute
|
||||
const dataRedirect = html.match(/data-(?:redirect|href|url)=["'](https?:\/\/(?!news\.google\.com)[^"']+)["']/i);
|
||||
if (dataRedirect) return dataRedirect[1];
|
||||
|
||||
// Meta refresh
|
||||
const metaRefresh = html.match(/<meta[^>]*http-equiv=["']refresh["'][^>]*content=["']\d+;\s*url=([^"']+)["']/i);
|
||||
if (metaRefresh) return metaRefresh[1];
|
||||
|
||||
// Canonical
|
||||
const canonical = html.match(/<link[^>]*rel=["']canonical["'][^>]*href=["']([^"']+)["']/i);
|
||||
if (canonical && !canonical[1].includes('news.google.com')) return canonical[1];
|
||||
|
||||
// Any external link that looks like an article
|
||||
const externalLink = html.match(/href=["'](https?:\/\/(?!(?:news|www)\.google\.com)[^"']+)["']/i);
|
||||
if (externalLink) return externalLink[1];
|
||||
|
||||
this.logger.warn(`Could not resolve Google News URL, returning original`);
|
||||
return getResponse.url;
|
||||
} catch (error) {
|
||||
this.logger.warn(`Failed to resolve redirect for ${url}: ${error.message}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode actual article URL from Google News RSS URL.
|
||||
* Google News encodes URLs in the path segment as base64.
|
||||
* Format: https://news.google.com/rss/articles/CBMi{base64payload}
|
||||
*/
|
||||
private decodeGoogleNewsUrl(googleUrl: string): string | null {
|
||||
try {
|
||||
// Extract the base64 part from the URL path
|
||||
const urlObj = new URL(googleUrl);
|
||||
const pathParts = urlObj.pathname.split('/');
|
||||
// Find the article ID part (after /articles/)
|
||||
const articlesIndex = pathParts.indexOf('articles');
|
||||
if (articlesIndex === -1 || articlesIndex + 1 >= pathParts.length) {
|
||||
return null;
|
||||
}
|
||||
|
||||
let articleId = pathParts[articlesIndex + 1];
|
||||
// Remove query params if they got attached
|
||||
if (articleId.includes('?')) {
|
||||
articleId = articleId.split('?')[0];
|
||||
}
|
||||
|
||||
// The article ID starts with "CBMi" prefix, try to decode the base64
|
||||
// Make base64 URL-safe: replace - with + and _ with /
|
||||
let base64 = articleId
|
||||
.replace(/-/g, '+')
|
||||
.replace(/_/g, '/');
|
||||
|
||||
// Add padding if needed
|
||||
while (base64.length % 4 !== 0) {
|
||||
base64 += '=';
|
||||
}
|
||||
|
||||
// Decode base64
|
||||
const decoded = Buffer.from(base64, 'base64').toString('utf-8');
|
||||
|
||||
// Extract URLs from the decoded string using regex
|
||||
const urlMatches = decoded.match(/https?:\/\/[^\s"'<>\x00-\x1F]+/g);
|
||||
if (urlMatches && urlMatches.length > 0) {
|
||||
// Filter out Google URLs
|
||||
const nonGoogleUrl = urlMatches.find(u => !u.includes('google.com'));
|
||||
if (nonGoogleUrl) {
|
||||
// Clean up any trailing garbage characters
|
||||
const cleanUrl = nonGoogleUrl.replace(/[\x00-\x1F\x7F-\x9F]+.*$/, '');
|
||||
this.logger.log(`Decoded article URL from base64: ${cleanUrl}`);
|
||||
return cleanUrl;
|
||||
}
|
||||
return urlMatches[0];
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (error) {
|
||||
this.logger.warn(`Failed to decode Google News URL: ${error.message}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -129,64 +456,57 @@ export class WebScraperService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch page content (simulated)
|
||||
* Fetch page content using real HTTP fetch
|
||||
*/
|
||||
private async fetchPage(url: string, options?: ScraperOptions): Promise<{ html: string } | null> {
|
||||
// In production, use:
|
||||
// 1. node-fetch or axios for simple pages
|
||||
// 2. Puppeteer/Playwright for JavaScript-rendered pages
|
||||
// 3. Cheerio for HTML parsing
|
||||
const timeout = options?.timeout || 15000;
|
||||
const userAgent = options?.userAgent || this.defaultUserAgent;
|
||||
|
||||
// Simulated HTML for demonstration
|
||||
const mockHtml = `
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Sample Article: Content Creation Strategies</title>
|
||||
<meta name="description" content="Learn the best content creation strategies for 2024">
|
||||
<meta name="author" content="John Doe">
|
||||
<meta name="keywords" content="content, creation, marketing, strategy">
|
||||
<meta property="og:title" content="Content Creation Strategies">
|
||||
<meta property="og:description" content="Master content creation with these proven strategies">
|
||||
<meta property="og:image" content="https://example.com/image.jpg">
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
<h1>10 Content Creation Strategies for 2024</h1>
|
||||
<p class="author">By John Doe | Published: January 15, 2024</p>
|
||||
try {
|
||||
const controller = new AbortController();
|
||||
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
||||
|
||||
<h2>Introduction</h2>
|
||||
<p>Content creation has evolved significantly over the past year. In this comprehensive guide, we'll explore the most effective strategies for creating engaging content.</p>
|
||||
const response = await fetch(url, {
|
||||
headers: {
|
||||
'User-Agent': userAgent,
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'identity',
|
||||
},
|
||||
signal: controller.signal,
|
||||
redirect: 'follow',
|
||||
});
|
||||
|
||||
<h2>1. Focus on Value First</h2>
|
||||
<p>The most successful content creators prioritize providing value to their audience. According to a recent study, 78% of consumers prefer brands that create custom content.</p>
|
||||
clearTimeout(timeoutId);
|
||||
|
||||
<h2>2. Embrace Short-Form Video</h2>
|
||||
<p>Short-form video continues to dominate. TikTok and Instagram Reels have shown that 15-60 second videos can generate massive engagement.</p>
|
||||
if (!response.ok) {
|
||||
this.logger.warn(`HTTP ${response.status} for ${url}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
<blockquote>"Content is king, but distribution is queen." - Gary Vaynerchuk</blockquote>
|
||||
const contentType = response.headers.get('content-type') || '';
|
||||
if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
|
||||
this.logger.warn(`Non-HTML content type: ${contentType} for ${url}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
<h2>3. Use AI Wisely</h2>
|
||||
<p>AI tools like ChatGPT and Claude can help with ideation and drafting, but human creativity remains essential for authentic content.</p>
|
||||
const html = await response.text();
|
||||
|
||||
<h3>Key Statistics</h3>
|
||||
<ul>
|
||||
<li>85% of marketers use content marketing</li>
|
||||
<li>Video content generates 1200% more shares</li>
|
||||
<li>Long-form content gets 77% more backlinks</li>
|
||||
</ul>
|
||||
if (!html || html.length < 100) {
|
||||
this.logger.warn(`Empty or very short response from ${url}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
<h2>Conclusion</h2>
|
||||
<p>Success in content creation requires a balance of strategy, creativity, and consistency. Start implementing these strategies today!</p>
|
||||
|
||||
<a href="/related-article">Read more articles</a>
|
||||
<a href="https://external.com/resource">External resource</a>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
return { html: mockHtml };
|
||||
this.logger.log(`Successfully fetched ${url} (${html.length} chars)`);
|
||||
return { html };
|
||||
} catch (error) {
|
||||
if (error.name === 'AbortError') {
|
||||
this.logger.warn(`Request timed out for ${url}`);
|
||||
} else {
|
||||
this.logger.error(`Failed to fetch ${url}: ${error.message}`);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -194,6 +514,7 @@ export class WebScraperService {
|
||||
*/
|
||||
private parseHtml(html: string, url: string, options?: ScraperOptions): ScrapedContent {
|
||||
const domain = new URL(url).hostname;
|
||||
const baseUrl = new URL(url).origin;
|
||||
|
||||
// Extract title
|
||||
const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
|
||||
@@ -212,8 +533,11 @@ export class WebScraperService {
|
||||
// Extract links
|
||||
const links = options?.extractLinks !== false ? this.extractLinks(html, domain) : [];
|
||||
|
||||
// Extract images
|
||||
const images = options?.extractImages !== false ? this.extractImages(html) : [];
|
||||
// Extract images with absolute URL resolution
|
||||
const images = options?.extractImages !== false ? this.extractImages(html, baseUrl) : [];
|
||||
|
||||
// Extract video links (YouTube, etc.)
|
||||
const videoLinks = this.extractVideoLinks(html);
|
||||
|
||||
// Extract metadata
|
||||
const metadata = this.extractMetadata(html);
|
||||
@@ -232,6 +556,7 @@ export class WebScraperService {
|
||||
headings,
|
||||
links,
|
||||
images,
|
||||
videoLinks,
|
||||
metadata,
|
||||
wordCount,
|
||||
readingTime,
|
||||
@@ -306,21 +631,82 @@ export class WebScraperService {
|
||||
}
|
||||
|
||||
/**
 * Extract images from HTML with absolute URL resolution.
 *
 * Scans <img> tags in both attribute orderings (src-before-alt and
 * alt-before-src) plus the og:image meta tag, drops tracking pixels,
 * icons, and data: URIs, resolves relative and protocol-relative sources
 * against baseUrl, dedupes by resolved src, and caps the result at 20.
 *
 * @param html    Raw page markup.
 * @param baseUrl Page origin used to absolutise relative image paths.
 * @returns Up to 20 unique `{ src, alt }` image descriptors.
 */
private extractImages(html: string, baseUrl?: string): { src: string; alt: string }[] {
  const images: { src: string; alt: string }[] = [];
  // Match src before alt, or alt before src
  const regex = /<img[^>]*src=["']([^"']+)["'][^>]*(?:alt=["']([^"']*)["'])?/gi;
  const regex2 = /<img[^>]*alt=["']([^"']*)["'][^>]*src=["']([^"']+)["']/gi;
  let match;

  const addImage = (src: string, alt: string) => {
    // Skip tiny tracking pixels, icons, and ALL data: URIs — an inline
    // data:image/png would otherwise fall through to the relative-URL
    // branch below and get baseUrl wrongly prepended.
    if (src.includes('1x1') || src.includes('pixel') || src.startsWith('data:')) return;
    if (src.endsWith('.svg') || src.endsWith('.ico')) return;

    // Resolve protocol-relative and relative URLs to absolute form.
    let resolvedSrc = src;
    if (src.startsWith('//')) {
      resolvedSrc = `https:${src}`;
    } else if (baseUrl && !src.startsWith('http')) {
      resolvedSrc = src.startsWith('/') ? `${baseUrl}${src}` : `${baseUrl}/${src}`;
    }

    // Avoid duplicates
    if (!images.some(img => img.src === resolvedSrc)) {
      images.push({ src: resolvedSrc, alt: alt || '' });
    }
  };

  while ((match = regex.exec(html)) !== null) {
    addImage(match[1], match[2] || '');
  }
  while ((match = regex2.exec(html)) !== null) {
    addImage(match[2], match[1] || '');
  }

  // Also check og:image
  const ogImageMatch = html.match(/<meta[^>]*property=["']og:image["'][^>]*content=["']([^"']+)["']/i);
  if (ogImageMatch) {
    addImage(ogImageMatch[1], 'og-image');
  }

  return images.slice(0, 20);
}
|
||||
|
||||
/**
|
||||
* Extract video links (YouTube, Vimeo, etc.) from HTML
|
||||
*/
|
||||
private extractVideoLinks(html: string): string[] {
|
||||
const videos: Set<string> = new Set();
|
||||
|
||||
// YouTube iframe embeds
|
||||
const iframeRegex = /<iframe[^>]*src=["']([^"']*(?:youtube\.com|youtu\.be|vimeo\.com)[^"']*)["']/gi;
|
||||
let match;
|
||||
while ((match = iframeRegex.exec(html)) !== null) {
|
||||
let url = match[1];
|
||||
if (url.startsWith('//')) url = `https:${url}`;
|
||||
// Convert embed URL to watch URL
|
||||
url = url.replace('/embed/', '/watch?v=').replace('?feature=oembed', '');
|
||||
videos.add(url);
|
||||
}
|
||||
|
||||
// YouTube links in anchors
|
||||
const anchorRegex = /<a[^>]*href=["']([^"']*(?:youtube\.com\/watch|youtu\.be\/)[^"']*)["']/gi;
|
||||
while ((match = anchorRegex.exec(html)) !== null) {
|
||||
videos.add(match[1]);
|
||||
}
|
||||
|
||||
// data-video-url or data-src attributes
|
||||
const dataRegex = /data-(?:video-url|src)=["']([^"']*(?:youtube\.com|youtu\.be|vimeo\.com)[^"']*)["']/gi;
|
||||
while ((match = dataRegex.exec(html)) !== null) {
|
||||
let url = match[1];
|
||||
if (url.startsWith('//')) url = `https:${url}`;
|
||||
videos.add(url);
|
||||
}
|
||||
|
||||
return [...videos].slice(0, 10);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user