postiz/libraries/nestjs-libraries/src/openai/extract.content.service.ts

91 lines
2.4 KiB
TypeScript

import { Injectable } from '@nestjs/common';
import { JSDOM } from 'jsdom';
function findDepth(element: Element) {
let depth = 0;
let elementer = element;
while (elementer.parentNode) {
depth++;
// @ts-ignore
elementer = elementer.parentNode;
}
return depth;
}
@Injectable()
export class ExtractContentService {
async extractContent(url: string) {
const load = await (await fetch(url)).text();
const dom = new JSDOM(load);
// only element that has a title
const allTitles = Array.from(dom.window.document.querySelectorAll('*'))
.filter((f) => {
return (
f.querySelector('h1') ||
f.querySelector('h2') ||
f.querySelector('h3') ||
f.querySelector('h4') ||
f.querySelector('h5') ||
f.querySelector('h6')
);
})
.reverse();
const findTheOneWithMostTitles = allTitles.reduce(
(all, current) => {
const depth = findDepth(current);
const calculate = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].reduce(
(total, tag) => {
if (current.querySelector(tag)) {
return total + 1;
}
return total;
},
0
);
if (calculate > all.total) {
return { total: calculate, depth, element: current };
}
if (depth > all.depth) {
return { total: calculate, depth, element: current };
}
return all;
},
{ total: 0, depth: 0, element: null as Element | null }
);
return findTheOneWithMostTitles?.element?.textContent?.replace(/\n/g, ' ').replace(/ {2,}/g, ' ');
//
// const allElements = Array.from(
// dom.window.document.querySelectorAll('*')
// ).filter((f) => f.tagName !== 'SCRIPT');
// const findIndex = allElements.findIndex((element) => {
// return (
// ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].indexOf(
// element.tagName.toLowerCase()
// ) > -1
// );
// });
//
// if (!findIndex) {
// return false;
// }
//
// return allElements
// .slice(findIndex)
// .map((element) => element.textContent)
// .filter((f) => {
// const trim = f?.trim();
// return (trim?.length || 0) > 0 && trim !== '\n';
// })
// .map((f) => f?.trim())
// .join('')
// .replace(/\n/g, ' ')
// .replace(/ {2,}/g, ' ');
}
}