diff --git a/package-lock.json b/package-lock.json index 8df4aef..1f71840 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,12 +11,14 @@ "dependencies": { "cheerio": "^1.0.0", "express": "^4.21.1", + "multer": "^1.4.5-lts.1", "nodemon": "^3.1.7" }, "devDependencies": { "@types/cheerio": "^0.22.35", "@types/express": "^5.0.0", "@types/jest": "^29.5.13", + "@types/multer": "^1.4.12", "jest": "^29.7.0", "ts-jest": "^29.2.5", "ts-node": "^10.9.2" @@ -1248,6 +1250,15 @@ "integrity": "sha512-/pyBZWSLD2n0dcHE3hq8s8ZvcETHtEuF+3E7XVt0Ig2nvsVQXdghHVcEkIWjy9A0wKfTn97a/PSDYohKIlnP/w==", "dev": true }, + "node_modules/@types/multer": { + "version": "1.4.12", + "resolved": "https://registry.npmjs.org/@types/multer/-/multer-1.4.12.tgz", + "integrity": "sha512-pQ2hoqvXiJt2FP9WQVLPRO+AmiIm/ZYkavPlIQnx282u4ZrVdztx0pkh3jjpQt0Kz+YI0YhSG264y08UJKoUQg==", + "dev": true, + "dependencies": { + "@types/express": "*" + } + }, "node_modules/@types/node": { "version": "22.7.5", "resolved": "https://registry.npmjs.org/@types/node/-/node-22.7.5.tgz", @@ -1398,6 +1409,11 @@ "node": ">= 8" } }, + "node_modules/append-field": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/append-field/-/append-field-1.0.0.tgz", + "integrity": "sha512-klpgFSWLW1ZEs8svjfb7g4qWY0YS5imI82dTg+QahUvJ8YqAY0P10Uk8tTyh9ZGuYEZEMaeJYCF5BFuX552hsw==" + }, "node_modules/arg": { "version": "4.1.3", "resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz", @@ -1663,8 +1679,18 @@ "node_modules/buffer-from": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", - "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", - "dev": true + "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==" + }, + "node_modules/busboy": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz", + "integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==", + "dependencies": { + "streamsearch": "^1.1.0" + }, + "engines": { + "node": ">=10.16.0" + } }, "node_modules/bytes": { "version": "3.1.2", @@ -1913,6 +1939,20 @@ "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==" }, + "node_modules/concat-stream": { + "version": "1.6.2", + "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", + "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", + "engines": [ + "node >= 0.8" + ], + "dependencies": { + "buffer-from": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^2.2.2", + "typedarray": "^0.0.6" + } + }, "node_modules/content-disposition": { "version": "0.5.4", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", @@ -1951,6 +1991,11 @@ "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz", "integrity": "sha512-QADzlaHc8icV8I7vbaJXJwod9HWYp8uCqf1xa4OfNu1T7JVxQIrUgOWtHdNDtPiywmFbiS12VjotIXLrKM3orQ==" }, + "node_modules/core-util-is": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", + "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==" + }, "node_modules/create-jest": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/create-jest/-/create-jest-29.7.0.tgz", @@ -2908,6 +2953,11 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==" + }, "node_modules/isexe": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", @@ -3829,11 +3879,47 @@ "node": "*" } }, + "node_modules/minimist": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", + "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/mkdirp": { + "version": "0.5.6", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.6.tgz", + "integrity": "sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==", + "dependencies": { + "minimist": "^1.2.6" + }, + "bin": { + "mkdirp": "bin/cmd.js" + } + }, "node_modules/ms": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==" }, + "node_modules/multer": { + "version": "1.4.5-lts.1", + "resolved": "https://registry.npmjs.org/multer/-/multer-1.4.5-lts.1.tgz", + "integrity": "sha512-ywPWvcDMeH+z9gQq5qYHCCy+ethsk4goepZ45GLD63fOu0YcNecQxi64nDs3qluZB+murG3/D4dJ7+dGctcCQQ==", + "dependencies": { + "append-field": "^1.0.0", + "busboy": "^1.0.0", + "concat-stream": "^1.5.2", + "mkdirp": "^0.5.4", + "object-assign": "^4.1.1", + "type-is": "^1.6.4", + "xtend": "^4.0.0" + }, + "engines": { + "node": ">= 6.0.0" + } + }, "node_modules/natural-compare": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", @@ -3939,6 +4025,14 @@ "url": "https://github.com/fb55/nth-check?sponsor=1" } }, + "node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/object-inspect": { "version": "1.13.2", "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz", @@ -4198,6 +4292,11 @@ "url": "https://github.com/chalk/ansi-styles?sponsor=1" } }, + "node_modules/process-nextick-args": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", + "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==" + }, "node_modules/prompts": { "version": "2.4.2", "resolved": "https://registry.npmjs.org/prompts/-/prompts-2.4.2.tgz", @@ -4286,6 +4385,25 @@ "integrity": "sha512-/LLMVyas0ljjAtoYiPqYiL8VWXzUUdThrmU5+n20DZv+a+ClRoevUzw5JxU+Ieh5/c87ytoTBV9G1FiKfNJdmg==", "dev": true }, + "node_modules/readable-stream": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/readable-stream/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" + }, "node_modules/readdirp": { "version": "3.6.0", "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", @@ -4574,6 +4692,27 @@ "node": ">= 0.8" } }, + "node_modules/streamsearch": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz", + "integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==", + "engines": { + "node": ">=10.0.0" + } + }, + "node_modules/string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "dependencies": { + "safe-buffer": "~5.1.0" + } + }, + "node_modules/string_decoder/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" + }, "node_modules/string-length": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz", @@ -4846,6 +4985,11 @@ "node": ">= 0.6" } }, + "node_modules/typedarray": { + "version": "0.0.6", + "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", + "integrity": "sha512-/aCDEGatGvZ2BIk+HmLf4ifCJFwvKFNb9/JeZPMulfgFracn9QFcAf5GO8B/mweUjSoblS5In0cWhqpfs/5PQA==" + }, "node_modules/typescript": { "version": "5.6.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.3.tgz", @@ -4917,6 +5061,11 @@ "browserslist": ">= 4.21.0" } }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==" + }, "node_modules/utils-merge": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", @@ -5053,6 +5202,14 @@ "node": "^12.13.0 || ^14.15.0 || >=16.0.0" } }, + "node_modules/xtend": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", + "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", + "engines": { + "node": ">=0.4" + } + }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/package.json b/package.json index f478241..94412aa 100644 --- a/package.json +++ b/package.json @@ -15,12 +15,14 @@ "dependencies": { "cheerio": "^1.0.0", "express": "^4.21.1", + "multer": "^1.4.5-lts.1", "nodemon": "^3.1.7" }, "devDependencies": { "@types/cheerio": "^0.22.35", "@types/express": "^5.0.0", "@types/jest": "^29.5.13", + "@types/multer": "^1.4.12", "jest": "^29.7.0", "ts-jest": "^29.2.5", "ts-node": "^10.9.2" diff --git a/src/controllers/processHtmlController.ts b/src/controllers/processHtmlController.ts index a14f0a1..9bfe6e3 100644 --- a/src/controllers/processHtmlController.ts +++ b/src/controllers/processHtmlController.ts @@ -1,6 +1,5 @@ import { NextFunction, Request, Response } from "express"; import { ProcessHtmlService } from "../services/processHtmlService"; -import path from "path"; import { CssCode } from "../types"; const service = new ProcessHtmlService(); @@ -10,10 +9,11 @@ export async function processHtmlInputController( res: Response, next: NextFunction, ) { - const cssCode = req.body as CssCode; - const htmlCode = path.resolve(__dirname, path.join("../data/index.html")); + console.log(JSON.parse(req.body.data)); + const cssCode = JSON.parse(req.body.data) as CssCode; try { - const response = await service.extractData(htmlCode, cssCode); + const htmlContent = req.file!.buffer.toString(); + const response = await service.extractData(htmlContent, cssCode); res.json(response); } catch (err) { next(err); diff --git a/src/middlewares/validateFile.ts b/src/middlewares/validateFile.ts new file mode 100644 index 0000000..55ce05a --- /dev/null +++ b/src/middlewares/validateFile.ts @@ -0,0 +1,17 @@ +export const validateFile = ( + req: Express.Request, + file: Express.Multer.File, + cb: Function, +) => { + const filetypes = /html|htm/; + const mimetype = filetypes.test(file.mimetype); + const extname = filetypes.test( + file.originalname.split(".").pop()!.toLowerCase(), + ); + + if (mimetype && extname) { + return cb(null, true); + } else { + return cb(new Error("Only HTML files are allowed!"), false); + } +}; diff --git a/src/routes/processHtmlRouter.ts b/src/routes/processHtmlRouter.ts index 3d8406d..419b0f2 100644 --- a/src/routes/processHtmlRouter.ts +++ b/src/routes/processHtmlRouter.ts @@ -1,8 +1,17 @@ import { Router } from "express"; import { processHtmlInputController } from "../controllers/processHtmlController"; +import { validateFile } from "../middlewares/validateFile"; + +import multer from "multer"; const r = Router(); -r.post("/extract", processHtmlInputController); +const storage = multer.memoryStorage(); +const upload = multer({ + storage, + fileFilter: validateFile, +}); + +r.post("/extract", upload.single("htmlFile"), processHtmlInputController); export default r; diff --git a/src/services/processHtmlService.test.ts b/src/services/processHtmlService.test.ts index d05a1ce..e3b10d4 100644 --- a/src/services/processHtmlService.test.ts +++ b/src/services/processHtmlService.test.ts @@ -1,5 +1,6 @@ import path from "path"; import { ProcessHtmlService } from "../services/processHtmlService"; +import {readFile} from "fs/promises"; describe("Process html api", () => { test("Extract html: Example 1", async () => { @@ -13,7 +14,8 @@ describe("Process html api", () => { __dirname, path.join("../data/index.html"), ); - const r = await service.extractData(htmlCode, cssCode); + const html = await readFile(htmlCode, "utf-8"); + const r = await service.extractData(html, cssCode); expect(r).toBeTruthy(); expect(r).toEqual({ title: "This is the title of the page", @@ -34,7 +36,8 @@ describe("Process html api", () => { __dirname, path.join("../data/index.html"), ); - const r = await service.extractData(htmlCode, cssCode); + const html = await readFile(htmlCode, "utf-8"); + const r = await service.extractData(html, cssCode); expect(r).toBeTruthy(); expect(r).toEqual({ firstParagraph: @@ -65,7 +68,8 @@ describe("Process html api", () => { __dirname, path.join("../data/index.html"), ); - const r = await service.extractData(htmlCode, cssCode); + const html = await readFile(htmlCode, "utf-8"); + const r = await service.extractData(html, cssCode); expect(r).toBeTruthy(); expect(r).toEqual({ title: "This is the title of the page", diff --git a/src/services/processHtmlService.ts b/src/services/processHtmlService.ts index fc8649e..46001fb 100644 --- a/src/services/processHtmlService.ts +++ b/src/services/processHtmlService.ts @@ -1,14 +1,11 @@ -import { readFile } from "fs/promises"; import * as cheerio from "cheerio"; import { CssCode, ExtractResult } from "../types"; export class ProcessHtmlService { async extractData(htmlCode: string, cssCode: CssCode): Promise { try { - const html = await readFile(htmlCode, "utf-8"); const result: ExtractResult = {}; - - const $ = cheerio.load(html); + const $ = cheerio.load(htmlCode); for (const [key, selector] of Object.entries(cssCode)) { if (typeof selector === "string") { result[key] = this.extractDirectElements($, selector); @@ -19,6 +16,7 @@ export class ProcessHtmlService { return result; } catch (err) { + console.log(err); throw new Error("Error occurred while extracting data."); } }