/* ──────────────────────────────────────────────────────────────
   popup.js — block-based page text collection with de-duplication
   (runs both inside each frame and in the extension popup)
   - Blocks: article/section/main/div/p/li/h1~h6/figure/table/caption
   - Block separator: '#', word separator: '*'
   - Only visible elements are collected (visibility/display/size)
   - Only leaf blocks are kept
   - 1st pass (inside each frame): dedupe within a block and across blocks
   - 2nd pass (popup): dedupe across all frames
   - Fallback: body.innerText
   NOTE: page text that itself contains '#' or '*' is not escaped, so such
   characters are indistinguishable from separators in the output.
────────────────────────────────────────────────────────────── */

/** Separator placed between blocks in the serialized text. */
const BLOCK_SEPARATOR = '#';
/** Separator placed between words inside one block. */
const WORD_SEPARATOR = '*';

// Guarded so the pure helpers below can be loaded without a DOM (e.g. tests).
const hasDom = typeof document !== 'undefined';
const statusEl = hasDom ? document.getElementById('status') : null;
const extractBtn = hasDom ? document.getElementById('extractBtn') : null;

/**
 * Shows a status message in the popup; a no-op when the status element
 * is not present.
 * @param {string} m - Message to display.
 */
function setStatus(m) {
  if (statusEl) statusEl.textContent = m;
}

/** True while a collection run is in progress; repeated clicks are ignored. */
let inFlight = false;

/**
 * Returns the active tab of the current window.
 * @returns {Promise<object|undefined>} The tab, or undefined if none matched.
 */
async function getActiveTab() {
  const [tab] = await chrome.tabs.query({ active: true, currentWindow: true });
  return tab;
}

/* ===== Cross-frame dedupe (runs in the extension context) ===== */

/**
 * Builds the comparison key used for de-duplication: NFKC-normalized,
 * lower-cased, with every run of non-letter/non-digit characters collapsed
 * to a single space.
 * @param {string} s - Raw text.
 * @returns {string} Canonical key ('' for empty or punctuation-only input).
 */
function canonKeyGlobal(s) {
  return (s || '')
    .normalize('NFKC')
    .toLowerCase()
    .replace(/[^0-9\p{L}]+/gu, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Keeps the first item for every distinct canonical key and drops items
 * whose key is empty.
 * @param {string[]} items
 * @returns {string[]} Items in original order, without duplicates.
 */
function uniqueByCanonKeyGlobal(items) {
  const seen = new Set();
  const out = [];
  for (const item of items) {
    const key = canonKeyGlobal(item);
    if (!key || seen.has(key)) continue;
    seen.add(key);
    out.push(item);
  }
  return out;
}

/**
 * Removes duplicate blocks (compared by canonical key).
 * @param {string[]} blocks - Serialized blocks.
 * @returns {string[]} Unique blocks, first occurrence wins.
 */
function dedupeBlocksGlobal(blocks) {
  return uniqueByCanonKeyGlobal(blocks);
}

/**
 * Removes repeated words inside one block (compared by canonical key).
 * @param {string} block - Words joined by '*'.
 * @returns {string} The block with later repetitions removed.
 */
function dedupeInsideBlockGlobal(block) {
  if (!block) return '';
  const tokens = block
    .split(WORD_SEPARATOR)
    .map((t) => t.trim())
    .filter(Boolean);
  return uniqueByCanonKeyGlobal(tokens).join(WORD_SEPARATOR);
}

/**
 * Merges the text of all frame results and de-duplicates it, first inside
 * each block and then across blocks.
 * @param {Array<{text?: string}|null|undefined>} frameResults
 * @returns {string} Unique blocks joined by '#'.
 */
function mergeFramesAndDedupe(frameResults) {
  const allBlocks = [];
  for (const r of frameResults) {
    const text = (r?.text || '').trim();
    if (!text) continue;
    const parts = text
      .split(BLOCK_SEPARATOR)
      .map((s) => s.trim())
      .filter(Boolean);
    allBlocks.push(...parts);
  }
  const normalizedBlocks = allBlocks.map(dedupeInsideBlockGlobal);
  return dedupeBlocksGlobal(normalizedBlocks).join(BLOCK_SEPARATOR);
}

/* ===== Function injected into every frame ===== */

/**
 * Collects the visible text of the current document as '#'-separated blocks
 * of '*'-separated words.
 *
 * This function is passed as `func` to chrome.scripting.executeScript, so it
 * must be fully self-contained: every helper and constant it needs is
 * defined inside it rather than shared with the popup scope.
 *
 * @returns {{frameUrl: string, title: string, text: string}}
 */
function frameCollectByBlocks() {
  const BLOCK_SEP = '#';
  const WORD_SEP = '*';

  // Frame-local copies of the dedupe helpers (must be defined here).
  function canonKeyLocal(s) {
    return (s || '')
      .normalize('NFKC')
      .toLowerCase()
      .replace(/[^0-9\p{L}]+/gu, ' ')
      .replace(/\s+/g, ' ')
      .trim();
  }

  function uniqueByCanonKeyLocal(items) {
    const seen = new Set();
    const out = [];
    for (const item of items) {
      const key = canonKeyLocal(item);
      if (!key || seen.has(key)) continue;
      seen.add(key);
      out.push(item);
    }
    return out;
  }

  function dedupeBlocksLocal(blocks) {
    return uniqueByCanonKeyLocal(blocks);
  }

  function dedupeInsideBlockLocal(block) {
    if (!block) return '';
    const tokens = block
      .split(WORD_SEP)
      .map((t) => t.trim())
      .filter(Boolean);
    return uniqueByCanonKeyLocal(tokens).join(WORD_SEP);
  }

  // Tags whose text is never collected.
  const EXCLUDE = new Set([
    'SCRIPT', 'STYLE', 'NOSCRIPT', 'TEMPLATE',
    'IFRAME', 'SVG', 'CANVAS', 'CODE', 'PRE', 'KBD', 'SAMP',
    'TEXTAREA', 'INPUT', 'SELECT', 'BUTTON', 'LABEL', 'FORM', 'NAV', 'MENU',
  ]);

  // Elements treated as candidate text blocks.
  const BLOCK_SEL = [
    'article', 'section', 'main', '[role="main"]',
    'div', 'p', 'li',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'figure', 'figcaption', 'table', 'caption',
  ].join(',');

  const URL_RE = /\b(?:https?:\/\/|www\.)\S+/gi;

  /** True when the element is not excluded, not hidden by CSS, and has a size. */
  function isVisible(el) {
    if (!el) return false;
    if (EXCLUDE.has(el.tagName)) return false;
    const s = getComputedStyle(el);
    if (s.display === 'none' || s.visibility === 'hidden' || s.opacity === '0') {
      return false;
    }
    const r = el.getBoundingClientRect();
    // Rejected only when both the client rect and the offset size are empty.
    if ((r.width <= 0 || r.height <= 0) && (el.offsetWidth <= 0 || el.offsetHeight <= 0)) {
      return false;
    }
    return true;
  }

  /**
   * Concatenates the non-blank text nodes under `el`, skipping any text node
   * that has an excluded tag anywhere in its ancestor chain, then strips
   * URLs and collapses whitespace.
   */
  function extractText(el) {
    const walker = document.createTreeWalker(el, NodeFilter.SHOW_TEXT, {
      acceptNode(n) {
        const value = n.nodeValue || '';
        if (!value.trim()) return NodeFilter.FILTER_REJECT;
        // The walk goes all the way to the document root, not just to `el`.
        for (let p = n.parentElement; p; p = p.parentElement) {
          if (EXCLUDE.has(p.tagName)) return NodeFilter.FILTER_REJECT;
        }
        return NodeFilter.FILTER_ACCEPT;
      },
    });
    const parts = [];
    for (let node = walker.nextNode(); node; node = walker.nextNode()) {
      parts.push(node.nodeValue);
    }
    return parts.join(' ').replace(URL_RE, ' ').replace(/\s+/g, ' ').trim();
  }

  /**
   * Keeps only leaf blocks: candidates that contain no other candidate block
   * with more than 8 characters of text.
   *
   * Fix: the previous version skipped descendants that WERE candidates and
   * only looked at non-candidates, so every ancestor container was kept
   * (duplicating its children's text) while a block with a hidden text child
   * was dropped.
   *
   * NOTE(review): text placed directly in a non-leaf block (next to nested
   * blocks, e.g. table cells beside a <caption>) is not collected — confirm
   * this is acceptable for the target pages.
   */
  function pickLeafBlocks(nodes) {
    const candidates = new Set(nodes);
    return nodes.filter((el) => {
      for (const child of el.querySelectorAll(BLOCK_SEL)) {
        if (!candidates.has(child)) continue;
        if ((child.textContent || '').trim().length > 8) return false;
      }
      return true;
    });
  }

  /** Joins the whitespace-separated words of `text` with '*'. */
  function toStarWords(text) {
    if (!text) return '';
    return text.split(/\s+/).filter(Boolean).join(WORD_SEP);
  }

  /** First non-empty of og:title, twitter:title, first <h1>, document.title. */
  function getTitle() {
    const og = document.querySelector('meta[property="og:title"]')?.content?.trim();
    const tw = document.querySelector('meta[name="twitter:title"]')?.content?.trim();
    const h1 = document.querySelector('h1')?.textContent?.trim();
    const dt = document.title || '';
    return [og, tw, h1, dt].filter(Boolean)[0] || '';
  }

  const candidates = Array.from(document.querySelectorAll(BLOCK_SEL))
    .filter(isVisible)
    .filter((el) => (el.textContent || '').trim().length >= 2);
  const leafs = pickLeafBlocks(candidates);

  let blocks = [];
  const title = getTitle();
  if (title) blocks.push(toStarWords(title));

  let bodyBlockCount = 0;
  for (const el of leafs) {
    const t = extractText(el);
    if (t.length < 2) continue;
    blocks.push(toStarWords(t));
    bodyBlockCount += 1;
  }

  // Fallback when no body block was collected. Checking blocks.length would
  // almost never trigger because the title block is usually present.
  if (bodyBlockCount === 0) {
    const t = (document.body?.innerText || '').replace(/\s+/g, ' ').trim();
    const star = toStarWords(t);
    if (star) blocks.push(star);
  }

  // 1st pass inside the frame: dedupe within each block, then across blocks.
  blocks = blocks.map(dedupeInsideBlockLocal);
  blocks = dedupeBlocksLocal(blocks);

  return {
    frameUrl: location.href || '',
    title: title || '',
    text: blocks.join(BLOCK_SEP),
  };
}

/* ===== Debug (optional) ===== */

/**
 * Builds a short human-readable summary of a collection run for debugging.
 * @param {{title?: string, url?: string}} tabInfo
 * @param {Array<{frameUrl: string, text?: string}>} frameResults
 * @returns {string} Summary with at most 800 characters of text per frame.
 */
function assembleOutput(tabInfo, frameResults) {
  const PREVIEW_CHARS = 800;
  const ts = new Date().toISOString();
  const frames = frameResults.map((r, i) => {
    const text = r.text || '';
    // Ellipsis only when the preview actually truncates the text.
    const preview = text.length > PREVIEW_CHARS ? `${text.slice(0, PREVIEW_CHARS)}...` : text;
    return `---\n[Frame ${i + 1}] ${r.frameUrl}\n${preview}`;
  });
  return [
    `# Snapshot @ ${ts}`,
    `Tab: ${tabInfo.title || ''} | ${tabInfo.url || ''}`,
    `Frames: ${frameResults.length}`,
    frames.join('\n'),
  ].join('\n');
}

/* ===== Server upload ===== */
const API_BASE = "http://localhost:8000";
const API_KEY = "";

/**
 * POSTs the payload as JSON to `${API_BASE}/collect`.
 * @param {object} payload
 * @returns {Promise<any>} Parsed JSON response body.
 * @throws {Error} When the response status is not OK.
 */
async function sendToApi(payload) {
  const headers = { "Content-Type": "application/json" };
  if (API_KEY) headers["x-api-key"] = API_KEY;
  const res = await fetch(`${API_BASE}/collect`, {
    method: 'POST',
    headers,
    body: JSON.stringify(payload),
  });
  if (!res.ok) {
    const msg = await res.text().catch(() => "");
    throw new Error(`API ${res.status}: ${msg}`);
  }
  return res.json();
}

/* ===== Main handler ===== */

/** True when at least one frame result carries non-blank text. */
function hasAnyText(frameResults) {
  return frameResults.some((r) => (r.text || '').trim().length > 0);
}

/** Runs the collector in the tab (all frames or top frame only). */
async function collectFrames(tabId, allFrames) {
  const results = await chrome.scripting.executeScript({
    target: { tabId, allFrames },
    func: frameCollectByBlocks,
  });
  return (results || []).map((r) => r.result).filter(Boolean);
}

/**
 * Points the side panel of `tabId` at the summary page and opens it.
 * Failures are logged and never propagate.
 * NOTE(review): chrome.sidePanel.open() is documented to require a user
 * gesture; after the awaits above it may be rejected — confirm on target
 * Chrome versions.
 */
async function openSummaryPanel(tabId, apiRes) {
  try {
    const savedId = apiRes && typeof apiRes === 'object' && apiRes.id ? apiRes.id : null;
    if (chrome?.sidePanel?.setOptions && chrome?.sidePanel?.open) {
      await chrome.sidePanel.setOptions({
        tabId,
        path: savedId
          ? `sidepanel_summary.html?doc=${encodeURIComponent(savedId)}`
          : `sidepanel_summary.html`,
      });
      await chrome.sidePanel.open({ tabId });
    }
  } catch (e) {
    console.warn('사이드패널 열기 실패:', e);
  }
}

/** Click handler: collect, merge, upload, then open the summary panel. */
async function handleExtractClick() {
  if (inFlight) return;
  inFlight = true;
  setStatus('수집 중...');

  try {
    const tab = await getActiveTab();
    if (!tab?.id) {
      setStatus('활성 탭 없음');
      return;
    }

    const frameResults = await collectFrames(tab.id, true);

    // Nothing collected: retry on the top frame only.
    if (!hasAnyText(frameResults)) {
      const retried = await collectFrames(tab.id, false);
      for (const r of retried) {
        // Replace the earlier (empty) entry for the same frame instead of
        // appending, so the frame is not counted twice.
        const idx = frameResults.findIndex((f) => f.frameUrl === r.frameUrl);
        if (idx === -1) frameResults.push(r);
        else frameResults[idx] = r;
      }
    }

    if (!hasAnyText(frameResults)) {
      setStatus('수집된 데이터가 없습니다(폴백 실패).');
      console.debug('DEBUG(no-data):', assembleOutput(tab, frameResults));
      return;
    }

    console.debug(assembleOutput(tab, frameResults));

    // 2nd pass: dedupe across all frames.
    const payload = {
      tabUrl: tab.url || '',
      tabTitle: tab.title || '',
      collectedAt: new Date().toISOString(),
      framesCollected: frameResults.length,
      fullText: mergeFramesAndDedupe(frameResults),
      frames: frameResults.map((r) => r.frameUrl),
    };

    let apiRes = null;
    let sendFailed = false;
    try {
      apiRes = await sendToApi(payload);
    } catch (e) {
      sendFailed = true;
      console.warn('API 전송 실패:', e);
    }

    // The panel is opened even when the upload failed (without a doc id).
    // The tab that was collected is reused rather than re-querying the
    // active tab, which may have changed during the awaits above.
    await openSummaryPanel(tab.id, apiRes);

    // Do not report plain success when the upload failed.
    setStatus(sendFailed ? '완료(서버 전송 실패)' : '완료');
  } catch (err) {
    console.error(err);
    setStatus(`오류: ${err?.message || err}`);
  } finally {
    inFlight = false;
  }
}

if (extractBtn) {
  extractBtn.addEventListener('click', handleExtractClick);
} else if (hasDom) {
  console.warn('extractBtn element not found; extraction is disabled.');
}