/*
 * Decompiled with CFR 0.152.
 */
package com.lucidworks.connector.plugins.web.fetcher;

import com.google.inject.Inject;
import com.lucidworks.connector.plugins.web.WebConnectorException;
import com.lucidworks.connector.plugins.web.config.WebConfig;
import com.lucidworks.connector.plugins.web.fetcher.http.JsoupFilter;
import com.lucidworks.connector.plugins.web.fetcher.http.client.LinkValidator;
import com.lucidworks.connector.plugins.web.fetcher.http.client.WebClient;
import com.lucidworks.connector.plugins.web.fetcher.http.client.WebClientResponse;
import com.lucidworks.connector.plugins.web.fetcher.web.DataToEmit;
import com.lucidworks.connector.plugins.web.fetcher.web.DeleteDocument;
import com.lucidworks.connector.plugins.web.fetcher.web.ErrorDocument;
import com.lucidworks.connector.plugins.web.fetcher.web.NewDocument;
import com.lucidworks.connector.plugins.web.fetcher.web.SkipDocument;
import com.lucidworks.connector.plugins.web.util.JsoupUtil;
import com.lucidworks.fusion.connector.plugin.api.fetcher.type.content.FetchInput;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.NameValuePair;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.xsoup.Xsoup;

/**
 * Converts a fetched {@link WebClientResponse} into the item the connector emits
 * (new, skipped, deleted or error document) and extracts the outgoing links of a page.
 *
 * <p>Content filtering (include/exclude selectors) and content selection (field selectors)
 * are configured once in the constructor from the {@link WebConfig}.
 */
public class WebContentProcessor {
    private static final Logger logger = LoggerFactory.getLogger(WebContentProcessor.class);
    public static final String SIGNATURE = "signature";
    public static final String LAST_MODIFIED = "lastModified";
    public static final String LAST_FETCHED = "lastFetched";
    public static final String FETCHED_DATE = "fetchedDate";
    public static final String ID = "id";

    /** Name of the HTTP response header carrying robots directives. */
    private static final String ROBOTS_HEADER = "X-Robots-Tag";
    /** Directive that is shorthand for every restrictive robots directive. */
    private static final String ROBOTS_NONE = "none";
    /** The pattern ends in a literal 'Z', so it must only be applied to UTC date-times. */
    private static final DateTimeFormatter FETCHED_DATE_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ENGLISH);

    private final LinkValidator linkValidator;
    private final WebConfig config;
    private final boolean diagnosticMode;
    private final List<String> customLinkSelectors;
    private final String defaultCharSet;
    private final boolean obeyLinkNofollow;
    private final String defaultMimeType;
    /** Null when no include/exclude rule is configured; see {@link #doContentFiltering}. */
    private JsoupFilter filter;
    private boolean doContentFiltering;
    private boolean scrapeLinksBeforeFiltering;
    private List<String> fieldSelectors;
    private boolean doContentSelection;

    /**
     * Creates a processor and derives the filtering and field-selection setup from the configuration.
     *
     * @param config connector configuration
     * @param linkValidator validator applied to every extracted link
     */
    @Inject
    public WebContentProcessor(WebConfig config, LinkValidator linkValidator) {
        this.config = config;
        this.linkValidator = linkValidator;
        // Boolean.TRUE.equals treats an unset (null) flag as false instead of failing on unboxing.
        this.diagnosticMode = Boolean.TRUE.equals(config.diagnosticLogging());
        this.customLinkSelectors = config.properties().documentParsingConfig().customLinkSelectors();
        this.defaultCharSet = config.properties().documentParsingConfig().defaultCharSet();
        this.defaultMimeType = config.properties().documentParsingConfig().defaultMIMEType();
        this.obeyLinkNofollow = Boolean.TRUE.equals(config.properties().crawlAuthenticationConfig().obeyLinkNofollow());
        this.setupContentFiltering();
        this.setupContentSelection();
    }

    /**
     * Decides what to emit for a fetched response.
     *
     * <ul>
     *   <li>404/410: {@link DeleteDocument} when the delete404 recrawl rule is on, else {@link SkipDocument}.</li>
     *   <li>Any other status outside 2xx (except 304): {@link ErrorDocument}.</li>
     *   <li>304, an unchanged last-modified/signature, or a robots {@code noindex}: {@link SkipDocument}.</li>
     *   <li>Otherwise a {@link NewDocument} with the (optionally filtered) content, fields and metadata.</li>
     * </ul>
     *
     * @param webResponse the fetched response
     * @param input the fetch input the response belongs to
     * @return the item to emit
     * @throws WebConnectorException if the document cannot be encoded with the resolved charset
     */
    public DataToEmit process(WebClientResponse webResponse, FetchInput input) {
        int status = webResponse.getStatus();
        Map<String, Object> respMetadata = webResponse.getRespMetadata();
        if (status == 404 || status == 410) {
            if (this.config.properties().recrawlRulesConfig().delete404()) {
                return new DeleteDocument();
            }
            return new SkipDocument();
        }
        // Only 2xx counts as OK; 304 is handled separately just below.
        if ((status < 200 || status >= 300) && status != 304) {
            return new ErrorDocument(String.format("Non-OK HTTP status: %s", status));
        }
        // NOTE(review): why the presence of "length_l" means "skip" is not visible in this file — confirm.
        if (status == 304 || respMetadata.containsKey("length_l")) {
            return new SkipDocument();
        }
        Object rawContent = webResponse.getContent();
        if (rawContent == null) {
            return new ErrorDocument("No content");
        }

        String charset = (String)respMetadata.getOrDefault("charSet", this.defaultCharSet);
        String fetchedID = (String)respMetadata.get("fetchedID");
        String newSignature = (String)respMetadata.get("etag");
        long newLastModified = WebContentProcessor.asLong(respMetadata.get(LAST_MODIFIED));
        Map<String, Object> inputMetadata = input.getMetadata();

        if (this.isUnchanged(fetchedID, newSignature, newLastModified, input.getId(), inputMetadata)) {
            return new SkipDocument();
        }

        // Robots directives are evaluated on the unfiltered page: content filtering may strip
        // the <head> (and with it the robots meta tag) from the working copy.
        Document original = rawContent instanceof Document ? (Document)rawContent : null;
        if (this.hasDirective("noindex", WebContentProcessor.headersOf(respMetadata), original)) {
            return new SkipDocument();
        }

        Map<String, Object> fields = new HashMap<String, Object>();
        InputStream content = null;
        try {
            if (original != null) {
                // Work on a clone so the response's document is left untouched.
                Document document = original.clone();
                if (this.doContentFiltering) {
                    this.filter.filter(document);
                }
                if (this.doContentSelection) {
                    this.doContentSelection(document, fields);
                }
                content = new ByteArrayInputStream(document.toString().getBytes(this.getParseCharSet(charset)));
            } else if (rawContent instanceof InputStream) {
                content = (InputStream)rawContent;
            }
        }
        catch (UnsupportedEncodingException e) {
            throw new WebConnectorException("Could not process web data", e);
        }

        // UTC, because the formatter appends a literal 'Z'.
        ZonedDateTime fetchedDate = ZonedDateTime.now(ZoneOffset.UTC);
        fields.put(ID, fetchedID);
        fields.put("mimeType", respMetadata.get("mimeType"));
        fields.put("charSet", respMetadata.get("charSet"));
        fields.put(FETCHED_DATE, fetchedDate.format(FETCHED_DATE_FORMAT));
        fields.put("parent", inputMetadata.get("parent"));
        fields.values().removeIf(Objects::isNull);

        Map<String, Object> metadata = new HashMap<String, Object>();
        metadata.put(SIGNATURE, newSignature);
        metadata.put(LAST_MODIFIED, newLastModified);
        metadata.put(LAST_FETCHED, fetchedDate.toInstant().getEpochSecond());
        metadata.values().removeIf(Objects::isNull);
        return new NewDocument(content, fields, metadata);
    }

    /**
     * Extracts the validated outgoing links of an HTML/XML response.
     *
     * @param webResponse the fetched response
     * @return the valid links; empty when the content is not a parsed document or a robots
     *         {@code nofollow} directive applies
     */
    public Set<String> getLinks(WebClientResponse webResponse) {
        Object rawContent = webResponse.getContent();
        // instanceof is false for null, so this also covers a missing body.
        if (!(rawContent instanceof Document)) {
            return Set.of();
        }
        Map<String, Object> respMetadata = webResponse.getRespMetadata();
        Document document = ((Document)rawContent).clone();
        if (this.hasDirective("nofollow", WebContentProcessor.headersOf(respMetadata), document)) {
            return Set.of();
        }
        String fetchedID = (String)respMetadata.get("fetchedID");
        String charset = (String)respMetadata.getOrDefault("charSet", this.defaultCharSet);
        String mimeType = (String)respMetadata.getOrDefault("mimeType", this.defaultMimeType);
        // The filter only exists when at least one include/exclude rule is configured.
        if (this.doContentFiltering && !this.scrapeLinksBeforeFiltering) {
            this.filter.filter(document);
        }
        return this.getLinks(fetchedID, document, this.getParseCharSet(charset), mimeType, this.obeyLinkNofollow);
    }

    /**
     * Tells whether the previously stored last-modified value or signature shows the page has not changed.
     * Both comparisons only apply when the fetched ID equals the requested ID.
     */
    private boolean isUnchanged(String fetchedID, String newSignature, long newLastModified, String origID, Map<String, Object> inputMetadata) {
        if (!Objects.equals(fetchedID, origID)) {
            return false;
        }
        boolean forceRecrawl = this.config.properties().recrawlRulesConfig().forceRefresh();
        boolean clearSignatures = this.config.properties().recrawlRulesConfig().forceRefreshClearSignatures();
        Object storedSignature = inputMetadata.get(SIGNATURE);
        String inputSignature = storedSignature == null ? "" : storedSignature.toString();
        long inputLastModified = WebContentProcessor.asLong(inputMetadata.get(LAST_MODIFIED));

        boolean sameLastModified = !forceRecrawl && newLastModified != 0L && newLastModified == inputLastModified;
        boolean sameSignature = !clearSignatures && !inputSignature.isEmpty() && inputSignature.equals(newSignature);
        return sameLastModified || sameSignature;
    }

    /** Reads a numeric metadata value as a long; missing or non-numeric values count as 0. */
    private static long asLong(Object value) {
        return value instanceof Number ? ((Number)value).longValue() : 0L;
    }

    /** Returns the response headers stored under the "headers" metadata key, or null when absent. */
    @SuppressWarnings("unchecked") // this class reads the "headers" entry as a list of Header objects
    private static List<Header> headersOf(Map<String, Object> respMetadata) {
        return (List<Header>)respMetadata.get("headers");
    }

    /**
     * Checks for a robots directive in the X-Robots-Tag headers and, when a document is given,
     * in its robots meta tags. Always false when obeyRobotsMeta is not enabled.
     */
    private boolean hasDirective(String directive, List<Header> headers, Document content) {
        if (!Boolean.TRUE.equals(this.config.properties().crawlAuthenticationConfig().obeyRobotsMeta())) {
            return false;
        }
        if (this.hasHeaderDirective(directive, headers)) {
            return true;
        }
        return content != null && this.hasMetaDirective(directive, content);
    }

    /** True when any X-Robots-Tag header lists the directive or "none". */
    private boolean hasHeaderDirective(String directive, List<Header> headers) {
        if (headers == null) {
            return false;
        }
        return headers.stream()
                // HTTP header names are case-insensitive.
                .filter(h -> h != null && ROBOTS_HEADER.equalsIgnoreCase(h.getName()))
                .map(NameValuePair::getValue)
                .filter(Objects::nonNull)
                .flatMap(this::robotsDirectivesSplit)
                .anyMatch(d -> d.equals(directive) || d.equals(ROBOTS_NONE));
    }

    /** True when any {@code <meta name="robots">} element lists the directive or "none". */
    private boolean hasMetaDirective(String directive, Document content) {
        Elements robotsMetas = content.select("meta[name=robots]");
        return robotsMetas.stream()
                .map(meta -> meta.attr("content"))
                .flatMap(this::robotsDirectivesSplit)
                .anyMatch(d -> d.equals(directive) || d.equals(ROBOTS_NONE));
    }

    /** Splits a comma-separated robots value into trimmed, lower-cased directives. */
    private Stream<String> robotsDirectivesSplit(String robotsMeta) {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
        return Arrays.stream(robotsMeta.split(",")).map(c -> c.trim().toLowerCase(Locale.ROOT));
    }

    /**
     * Returns the given charset name when the JVM supports it, otherwise the configured default.
     * Null or syntactically illegal names also fall back to the default.
     */
    private String getParseCharSet(String charSet) {
        if (charSet == null) {
            return this.defaultCharSet;
        }
        try {
            return Charset.isSupported(charSet) ? charSet : this.defaultCharSet;
        }
        catch (IllegalArgumentException e) {
            // IllegalCharsetNameException (a subclass) is thrown for malformed names.
            return this.defaultCharSet;
        }
    }

    /** Collects the raw links of the document and keeps those the link validator accepts. */
    private Set<String> getLinks(String origURL, Document doc, String charSet, String mimeType, boolean obeyLinkNofollow) {
        Set<String> links = new HashSet<String>();
        for (String link : this.getRawLinks(origURL, doc, mimeType, obeyLinkNofollow)) {
            this.addLinkIfValid(link, links, charSet);
        }
        return links;
    }

    /** Adds the validated form of the link unless it is blank or rejected by the validator. */
    private void addLinkIfValid(String link, Set<String> links, String charSet) {
        if (link == null || link.trim().isEmpty()) {
            return;
        }
        String validLink = this.linkValidator.validate(link, charSet);
        if (validLink != null) {
            links.add(validLink);
        }
    }

    /**
     * Gathers unvalidated links from anchors, frames, iframes, {@code <link>} elements
     * (XML MIME types only) and the configured custom selectors.
     */
    private Set<String> getRawLinks(String origURL, Document doc, String mimeType, boolean obeyLinkNofollow) {
        Set<String> links = new HashSet<String>();
        for (Element anchor : doc.select("a")) {
            if (obeyLinkNofollow && WebContentProcessor.isNofollow(anchor)) {
                if (this.diagnosticMode) {
                    logger.info("Skipping link to {} due to rel='nofollow'", anchor.absUrl("href"));
                }
                continue;
            }
            links.add(anchor.absUrl("href"));
        }
        for (Element frame : doc.select("frame")) {
            links.add(frame.absUrl("src"));
        }
        for (Element iframe : doc.select("iframe")) {
            links.add(iframe.absUrl("src"));
        }
        if (WebClient.XML_MIMETYPES.contains(mimeType)) {
            // Feed-style documents may carry the target as element text rather than an href attribute.
            for (Element feedLink : doc.select("link")) {
                links.add(feedLink.hasAttr("href") ? feedLink.attr("href") : feedLink.text());
            }
        }
        this.addLinksFromCustomSelectors(origURL, this.customLinkSelectors, doc, links);
        return links;
    }

    /** True when the element's rel attribute, a whitespace-separated token list, contains "nofollow". */
    private static boolean isNofollow(Element link) {
        // attr() yields "" for a missing attribute; StringUtils.split then yields no tokens.
        String[] relTokens = StringUtils.split(link.attr("rel"));
        return relTokens != null && Arrays.stream(relTokens).anyMatch("nofollow"::equalsIgnoreCase);
    }

    /**
     * Evaluates each custom XPath selector against the document and adds the results as links.
     * Results not starting with http:// or https:// are appended to a base derived from the page URL.
     */
    private void addLinksFromCustomSelectors(String url, List<String> customLinkSelectors, Document doc, Set<String> links) {
        if (url == null || customLinkSelectors == null || customLinkSelectors.isEmpty()) {
            return;
        }
        try {
            URI uri = new URI(url);
            // getPath() is null for opaque URIs.
            String path = uri.getPath();
            // NOTE(review): for a URL ending in "/" this resolves against "..", i.e. the directory
            // above the page's own directory; kept as-is — confirm this is the intended base.
            URI parentUri = path != null && path.endsWith("/") ? uri.resolve("..") : uri.resolve(".");
            String parentUriStr = parentUri.toString();
            if (!parentUriStr.endsWith("/")) {
                parentUriStr = parentUriStr + "/";
            }
            for (String customLinkSelector : customLinkSelectors) {
                for (String linkTxt : Xsoup.compile(customLinkSelector).evaluate(doc).list()) {
                    if (this.diagnosticMode) {
                        logger.info("Custom link xpath selector {} for url {} result {}", customLinkSelector, url, linkTxt);
                    }
                    boolean absolute = StringUtils.startsWithIgnoreCase(linkTxt, "http://")
                            || StringUtils.startsWithIgnoreCase(linkTxt, "https://");
                    links.add(absolute ? linkTxt : parentUriStr + linkTxt);
                }
            }
        }
        catch (URISyntaxException e) {
            logger.error("Could not convert URL {} to a URI", url, e);
        }
    }

    /**
     * Builds the content filter from the include/exclude settings. The filter is only created,
     * and {@link #doContentFiltering} only set, when at least one of the lists has entries.
     */
    private void setupContentFiltering() {
        List<String> includeSelectors = this.config.properties().documentParsingConfig().includeSelectors();
        if (WebContentProcessor.hasEntries(includeSelectors)) {
            this.ensureFilter().addIncludedFilters(WebContentProcessor.asArray(includeSelectors));
        }
        List<String> includedIDs = this.config.properties().documentParsingConfig().includeTagIDs();
        if (WebContentProcessor.hasEntries(includedIDs)) {
            this.ensureFilter().addIncludedTagIDs(WebContentProcessor.asArray(includedIDs));
        }
        List<String> includedClasses = this.config.properties().documentParsingConfig().includeTagClasses();
        if (WebContentProcessor.hasEntries(includedClasses)) {
            this.ensureFilter().addIncludedTagClasses(WebContentProcessor.asArray(includedClasses));
        }
        List<String> includedTags = this.config.properties().documentParsingConfig().includeTags();
        if (WebContentProcessor.hasEntries(includedTags)) {
            this.ensureFilter().addIncludedTags(WebContentProcessor.asArray(includedTags));
        }
        List<String> excludeSelectors = this.config.properties().documentParsingConfig().excludeSelectors();
        if (WebContentProcessor.hasEntries(excludeSelectors)) {
            this.ensureFilter().addExcludedFilters(WebContentProcessor.asArray(excludeSelectors));
        }
        List<String> excludedIDs = this.config.properties().documentParsingConfig().excludeTagIDs();
        if (WebContentProcessor.hasEntries(excludedIDs)) {
            this.ensureFilter().addExcludedTagIDs(WebContentProcessor.asArray(excludedIDs));
        }
        List<String> excludedClasses = this.config.properties().documentParsingConfig().excludeTagClasses();
        if (WebContentProcessor.hasEntries(excludedClasses)) {
            this.ensureFilter().addExcludedTagClasses(WebContentProcessor.asArray(excludedClasses));
        }
        List<String> excludedTags = this.config.properties().documentParsingConfig().excludeTags();
        if (WebContentProcessor.hasEntries(excludedTags)) {
            this.ensureFilter().addExcludedTags(WebContentProcessor.asArray(excludedTags));
        }
        if (this.filter == null) {
            return;
        }
        this.doContentFiltering = true;
        this.scrapeLinksBeforeFiltering = Boolean.TRUE.equals(this.config.properties().documentParsingConfig().scrapeLinksBeforeFiltering());
        // "html" is always a filtering root; configured roots are added after it.
        List<String> filteringRootTags = new ArrayList<String>();
        filteringRootTags.add("html");
        List<String> configuredRootTags = this.config.properties().documentParsingConfig().filteringRootTags();
        if (configuredRootTags != null) {
            filteringRootTags.addAll(configuredRootTags);
        }
        this.filter.setFilteringRootTags(WebContentProcessor.asArray(filteringRootTags));
    }

    /** Returns the content filter, creating it on first use. */
    private JsoupFilter ensureFilter() {
        if (this.filter == null) {
            this.filter = new JsoupFilter();
        }
        return this.filter;
    }

    /** True when the list is non-null and has at least one element. */
    private static boolean hasEntries(List<String> values) {
        return values != null && !values.isEmpty();
    }

    /** Copies the list into a new String array. */
    private static String[] asArray(List<String> values) {
        return values.toArray(new String[0]);
    }

    /**
     * Collects the field selectors (tags, tag IDs, tag classes and raw selectors) and
     * enables content selection when there is at least one.
     */
    private void setupContentSelection() {
        this.fieldSelectors = new ArrayList<String>();
        this.config.properties().documentParsingConfig().tagFields().forEach(t -> this.fieldSelectors.add(JsoupUtil.makeTag(t)));
        this.config.properties().documentParsingConfig().tagIDFields().forEach(t -> this.fieldSelectors.add(JsoupUtil.makeIDAttr(t)));
        this.config.properties().documentParsingConfig().tagClassFields().forEach(t -> this.fieldSelectors.add(JsoupUtil.makeClassAttr(t)));
        this.fieldSelectors.addAll(this.config.properties().documentParsingConfig().selectorFields());
        this.doContentSelection = !this.fieldSelectors.isEmpty();
    }

    /**
     * For every field selector that matches, stores the list of matched values under the selector
     * string: script elements contribute their data, all other elements their text.
     */
    private void doContentSelection(Document doc, Map<String, Object> fields) {
        for (String fieldSelector : this.fieldSelectors) {
            Elements matches = doc.select(fieldSelector);
            if (matches.isEmpty()) {
                continue;
            }
            List<String> values = new ArrayList<String>(matches.size());
            for (Element match : matches) {
                // text() of a <script> element is empty; its content is exposed through data().
                boolean isScript = match.tagName().equalsIgnoreCase("script");
                values.add(isScript ? match.data() : match.text());
            }
            fields.put(fieldSelector, values);
        }
    }
}

