<?php

namespace App\Services\Parsers;

use Illuminate\Support\Str;
use DOMDocument;
use DOMXPath;

/**
 * Extracts structured review data (title, summary, pros/cons, FAQs, SEO
 * metadata, ...) from an HTML document or fragment.
 *
 * Extraction is heuristic: sections are recognised by heading text or by
 * class/id attributes, so unusual markup may yield empty results rather than
 * an error.
 */
class HTMLParser implements ParserInterface
{
    /** Arguments for XPath 1.0 translate(), used for ASCII case folding. */
    private const XPATH_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
    private const XPATH_LOWER = 'abcdefghijklmnopqrstuvwxyz';

    /** XPath predicate body matching any heading element. */
    private const HEADING_TEST = 'self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6';

    /** XPath predicate body matching elements that can carry an FAQ question. */
    private const QUESTION_TEST = 'self::h2 or self::h3 or self::h4 or self::h5 or self::h6 or self::dt or self::summary';

    /** XPath predicate body matching elements that terminate an FAQ answer. */
    private const ANSWER_BOUNDARY_TEST = self::HEADING_TEST . ' or self::dt or self::summary';

    /** Text (heading, class or id) that marks an FAQ section. */
    private const FAQ_PATTERN = '/faq|question|frequently\s+asked/i';

    /**
     * Parse HTML into the normalised content array.
     *
     * @param string $content Raw HTML; a full document or a fragment.
     *
     * @return array{
     *     title: string, slug: string, summary: string, content_html: string,
     *     content_markdown: string, affiliate_link: string, pros: array,
     *     cons: array, features: array, faqs: array, tables: array, cta: string,
     *     seo: array{meta_title: string, meta_description: string, keywords: array}
     * }
     */
    public function parse(string $content): array
    {
        $dom = $this->loadDocument($content);
        $xpath = new DOMXPath($dom);

        // Title: first non-empty <h1>, falling back to <title>.
        $title = $this->firstText($xpath, '//h1');
        if ($title === '') {
            $title = $this->firstText($xpath, '//title');
        }

        // Summary: meta description when present, otherwise the first paragraph.
        $summary = $this->firstText(
            $xpath,
            '//meta[translate(@name, "' . self::XPATH_UPPER . '", "' . self::XPATH_LOWER . '")="description"]/@content'
        );
        if ($summary === '') {
            $summary = $this->firstText($xpath, '//p');
        }

        $pros = $this->extractListSection($xpath, ['pros', 'advantages', 'benefits']);
        $cons = $this->extractListSection($xpath, ['cons', 'disadvantages', 'drawbacks']);
        $features = $this->extractListSection($xpath, ['features', 'key features']);
        $faqs = $this->extractFAQs($xpath);

        // Affiliate link: first link with a usable href whose text mentions
        // "buy" or "affiliate" (case-insensitive).
        $affiliateLink = '';
        foreach ($xpath->query('//a[@href]') as $link) {
            $href = trim($link->getAttribute('href'));
            if ($href === '') {
                continue;
            }
            $text = $link->textContent;
            if (stripos($text, 'buy') !== false || stripos($text, 'affiliate') !== false) {
                $affiliateLink = $href;
                break;
            }
        }

        // CTA: first non-empty element whose class or id mentions "cta".
        $cta = $this->firstText($xpath, '//*[contains(@class, "cta") or contains(@id, "cta")]');

        // Serialised <body> contents; the raw input is only used when the
        // parser produced no <body> at all (e.g. empty or head-only input).
        $body = $dom->getElementsByTagName('body');
        $contentHtml = $body->length > 0 ? $this->getInnerHTML($body->item(0)) : $content;

        // SEO metadata: Open Graph values win over the derived title/summary.
        $metaTitle = $this->firstText(
            $xpath,
            '//meta[@property="og:title"]/@content | //meta[@name="og:title"]/@content'
        );
        if ($metaTitle === '') {
            $metaTitle = $title;
        }

        $metaDescription = $this->firstText(
            $xpath,
            '//meta[@property="og:description"]/@content | //meta[@name="og:description"]/@content'
        );
        if ($metaDescription === '') {
            $metaDescription = $summary;
        }

        return [
            'title' => $title,
            // Strict comparison so a title of "0" still gets a slug.
            'slug' => $title !== '' ? Str::slug($title) : '',
            'summary' => $summary,
            'content_html' => $contentHtml,
            'content_markdown' => '',
            'affiliate_link' => $affiliateLink,
            'pros' => $pros,
            'cons' => $cons,
            'features' => $features,
            'faqs' => $faqs,
            'tables' => [],
            'cta' => $cta,
            'seo' => [
                'meta_title' => $metaTitle,
                'meta_description' => $metaDescription,
                'keywords' => [],
            ],
        ];
    }

    /**
     * Collect the text of list items that belong to a keyword-labelled section.
     *
     * A section is either (a) the sibling elements that follow a heading whose
     * text contains one of the keywords, up to the next heading, or (b) any
     * element whose class or id contains one of the keywords. Keywords are
     * matched case-insensitively as whole words, so "cons" does not match
     * "considerations" or class="icons".
     *
     * @param DOMXPath $xpath    XPath bound to the parsed document.
     * @param array    $keywords Section labels to look for.
     *
     * @return array Unique, non-empty item texts as a sequential list.
     */
    protected function extractListSection(DOMXPath $xpath, array $keywords): array
    {
        $pattern = $this->buildKeywordPattern($keywords);
        if ($pattern === '') {
            return [];
        }

        $listQuery = 'descendant-or-self::*[self::ul or self::ol]';
        $candidateQuery = '//*[@class or @id or ' . self::HEADING_TEST . ']';
        $items = [];

        foreach ($xpath->query($candidateQuery) as $element) {
            $containers = [];

            // (b) the element itself is labelled through class/id.
            $label = $element->getAttribute('class') . ' ' . $element->getAttribute('id');
            if (preg_match($pattern, $label) === 1) {
                $containers[] = $element;
            }

            // (a) a matching heading labels the siblings that follow it.
            if ($this->isHeading($element) && preg_match($pattern, $element->textContent) === 1) {
                foreach ($this->followingSiblingsUntil($xpath, $element, self::HEADING_TEST) as $sibling) {
                    if ($sibling->nodeType === XML_ELEMENT_NODE) {
                        $containers[] = $sibling;
                    }
                }
            }

            foreach ($containers as $container) {
                foreach ($xpath->query($listQuery, $container) as $list) {
                    // Direct children only: nested lists are visited on their own.
                    foreach ($xpath->query('./li', $list) as $item) {
                        $text = trim($item->textContent);
                        if ($text !== '') {
                            $items[] = $text;
                        }
                    }
                }
            }
        }

        // array_unique() keeps the original keys; reindex so callers (and
        // json_encode) see a plain list.
        return array_values(array_unique($items));
    }

    /**
     * Extract question/answer pairs from FAQ sections.
     *
     * FAQ sections are elements whose class/id matches FAQ_PATTERN, or the
     * siblings following a heading whose text matches it (up to the next
     * heading of the same or a higher level). Inside a section, every
     * h2-h6, dt and summary element is a question; its answer is the text of
     * the siblings that follow it, up to the next question or heading.
     *
     * @param DOMXPath $xpath XPath bound to the parsed document.
     *
     * @return array List of ['question' => string, 'answer' => string];
     *               'answer' is '' when no answer text follows the question.
     */
    protected function extractFAQs(DOMXPath $xpath): array
    {
        $faqs = [];
        $seen = [];
        $questionQuery = 'descendant-or-self::*[' . self::QUESTION_TEST . ']';

        foreach ($this->findFaqScopes($xpath) as $scope) {
            foreach ($xpath->query($questionQuery, $scope) as $question) {
                // Scopes can overlap (nested FAQ containers); the node path
                // identifies a node uniquely, so each question is emitted once.
                $path = $question->getNodePath();
                if (isset($seen[$path])) {
                    continue;
                }
                $seen[$path] = true;

                $text = trim($question->textContent);
                if ($text === '') {
                    continue;
                }

                // Heuristic: a heading such as "FAQ" or "Frequently Asked
                // Questions" inside an FAQ container is the section title,
                // not a question. Real questions end with "?".
                if (
                    $this->isHeading($question)
                    && substr($text, -1) !== '?'
                    && preg_match(self::FAQ_PATTERN, $text) === 1
                ) {
                    continue;
                }

                $faqs[] = [
                    'question' => $text,
                    'answer' => $this->collectAnswer($xpath, $question),
                ];
            }
        }

        return $faqs;
    }

    /**
     * Serialise the children of a node as HTML.
     *
     * @param \DOMNode $node Node whose children are serialised.
     *
     * @return string Concatenated HTML of all child nodes ('' if there are none).
     */
    protected function getInnerHTML($node): string
    {
        // A DOMDocument has no ownerDocument; it serialises its own children.
        $document = $node instanceof DOMDocument ? $node : $node->ownerDocument;
        if ($document === null) {
            return '';
        }

        $innerHTML = '';
        foreach ($node->childNodes as $child) {
            $innerHTML .= $document->saveHTML($child);
        }

        return $innerHTML;
    }

    /**
     * Parse HTML into a DOMDocument without leaking libxml state.
     *
     * LIBXML_HTML_NOIMPLIED is deliberately not used: with it libxml treats
     * the first element of a fragment as the document root and nests the
     * remaining top-level elements inside it, which corrupts text extraction.
     * Letting libxml add html/body also guarantees a <body> for fragments.
     */
    private function loadDocument(string $content): DOMDocument
    {
        // Malformed HTML is expected; collect warnings instead of emitting
        // them, and restore the caller's setting afterwards.
        $previous = libxml_use_internal_errors(true);

        try {
            $dom = new DOMDocument();
            // The XML declaration prefix makes libxml read the input as UTF-8.
            $dom->loadHTML('<?xml encoding="UTF-8">' . $content, LIBXML_HTML_NODEFDTD);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $dom;
    }

    /**
     * Return the trimmed text of the first node matching $expression that is
     * not empty, or '' when there is none. Works for elements and attributes.
     */
    private function firstText(DOMXPath $xpath, string $expression): string
    {
        $nodes = $xpath->query($expression);
        if ($nodes === false) {
            return '';
        }

        foreach ($nodes as $node) {
            $text = trim($node->textContent);
            if ($text !== '') {
                return $text;
            }
        }

        return '';
    }

    /**
     * Build a case-insensitive whole-word regex matching any of $keywords,
     * or '' when no usable keyword was given. Whitespace inside a keyword
     * matches any run of whitespace in the subject.
     */
    private function buildKeywordPattern(array $keywords): string
    {
        $alternatives = [];

        foreach ($keywords as $keyword) {
            $words = preg_split('/\s+/', trim((string) $keyword), -1, PREG_SPLIT_NO_EMPTY);
            if ($words === false || $words === []) {
                continue;
            }

            $quoted = [];
            foreach ($words as $word) {
                $quoted[] = preg_quote($word, '/');
            }
            $alternatives[] = implode('\s+', $quoted);
        }

        if ($alternatives === []) {
            return '';
        }

        // Letters and digits count as word characters; "-" and "_" do not, so
        // class names like "pros-list" or "product_cons" still match.
        return '/(?<![\p{L}\p{N}])(?:' . implode('|', $alternatives) . ')(?![\p{L}\p{N}])/iu';
    }

    /** Whether $node is an h1-h6 element. */
    private function isHeading(\DOMNode $node): bool
    {
        return $node->nodeType === XML_ELEMENT_NODE
            && preg_match('/^h[1-6]$/i', $node->nodeName) === 1;
    }

    /**
     * Collect the siblings after $start, stopping before the first element
     * that is, or contains, an element matching $boundaryTest.
     *
     * @param string $boundaryTest Body of an XPath predicate, e.g. "self::h2 or self::h3".
     *
     * @return \DOMNode[] Sibling nodes in document order (any node type).
     */
    private function followingSiblingsUntil(DOMXPath $xpath, \DOMNode $start, string $boundaryTest): array
    {
        $siblings = [];

        for ($node = $start->nextSibling; $node !== null; $node = $node->nextSibling) {
            if (
                $node->nodeType === XML_ELEMENT_NODE
                && $xpath->evaluate("boolean(descendant-or-self::*[{$boundaryTest}])", $node)
            ) {
                break;
            }
            $siblings[] = $node;
        }

        return $siblings;
    }

    /**
     * Find the elements that make up FAQ sections, in document order.
     *
     * @return \DOMNode[] Elements to search for questions; may overlap.
     */
    private function findFaqScopes(DOMXPath $xpath): array
    {
        $scopes = [];
        $candidateQuery = '//*[@class or @id or ' . self::HEADING_TEST . ']';

        foreach ($xpath->query($candidateQuery) as $element) {
            $label = $element->getAttribute('class') . ' ' . $element->getAttribute('id');
            if (preg_match(self::FAQ_PATTERN, $label) === 1) {
                $scopes[] = $element;
            }

            if (!$this->isHeading($element) || preg_match(self::FAQ_PATTERN, $element->textContent) !== 1) {
                continue;
            }

            // The section under an "FAQ" heading ends at the next heading of
            // the same or a higher level (h2 ends at the next h1 or h2).
            $rank = (int) substr($element->nodeName, 1);
            $boundary = [];
            for ($level = 1; $level <= $rank; $level++) {
                $boundary[] = "self::h{$level}";
            }

            foreach ($this->followingSiblingsUntil($xpath, $element, implode(' or ', $boundary)) as $sibling) {
                if ($sibling->nodeType === XML_ELEMENT_NODE) {
                    $scopes[] = $sibling;
                }
            }
        }

        return $scopes;
    }

    /**
     * Join the text that follows a question, up to the next question or
     * heading, into a single newline-separated answer.
     */
    private function collectAnswer(DOMXPath $xpath, \DOMNode $question): string
    {
        $parts = [];

        foreach ($this->followingSiblingsUntil($xpath, $question, self::ANSWER_BOUNDARY_TEST) as $node) {
            // Skip comments and processing instructions.
            if ($node->nodeType !== XML_ELEMENT_NODE && $node->nodeType !== XML_TEXT_NODE) {
                continue;
            }

            $text = trim($node->textContent);
            if ($text !== '') {
                $parts[] = $text;
            }
        }

        return implode("\n", $parts);
    }
}

