Added optical character recognition.
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,2 +1,2 @@
|
|||||||
|
*.swp
|
||||||
node_modules
|
node_modules
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ export const options = {
|
|||||||
|
|
||||||
let scenarios = {
|
let scenarios = {
|
||||||
base:{
|
base:{
|
||||||
executor "per-vu-iterations",
|
executor: "per-vu-iterations",
|
||||||
vus: 1,
|
vus: 1,
|
||||||
iterations: 1,
|
iterations: 1,
|
||||||
maxDuration: "5m"
|
maxDuration: "5m"
|
||||||
|
|||||||
BIN
Tests/SampleHTML/amazon.html
Normal file
BIN
Tests/SampleHTML/amazon.html
Normal file
Binary file not shown.
@@ -4,6 +4,8 @@ import { SharedArray } from "k6/data"
|
|||||||
import http from "k6/http"
|
import http from "k6/http"
|
||||||
import { URL } from 'https://jslib.k6.io/url/1.0.0/index.js';
|
import { URL } from 'https://jslib.k6.io/url/1.0.0/index.js';
|
||||||
|
|
||||||
|
import { scrapeDataMod } from "./scrapeDataModule.js";
|
||||||
|
|
||||||
export const options = {
|
export const options = {
|
||||||
scenarios:{},
|
scenarios:{},
|
||||||
teardownTimeout: "120s",
|
teardownTimeout: "120s",
|
||||||
@@ -20,33 +22,7 @@ let scenarios = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const baseURL = "http://localhost:8001";
|
const baseURL = "http://localhost:8001";
|
||||||
let scrapeData = [
|
let scrapeData = scrapeDataMod;
|
||||||
{
|
|
||||||
url: "https://www.ebay.com/itm/154843103473?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D154843103473%26pmt%3D1%26noa%3D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DApple&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca4718a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
|
|
||||||
prefix:"US ",
|
|
||||||
idName: ".ux-textspans",
|
|
||||||
instanceCount: 36,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
url: "https://www.ebay.com/itm/254429385780?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D254429385780%26pmt%3D1%26noa%3D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DApple&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca4718a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
|
|
||||||
prefix: "US ",
|
|
||||||
idName: ".ux-textspans",
|
|
||||||
instanceCount: 39
|
|
||||||
},
|
|
||||||
{
|
|
||||||
url:"https://www.ebay.com/itm/125187190152?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115%20%20%20%20209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D125187190152%26pmt%3D0%26noa%%20%20%20%203D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DMiller&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca471%20%20%20%208a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
|
|
||||||
prefix: "US ",
|
|
||||||
idName: ".ux-textspans",
|
|
||||||
instanceCount: 14
|
|
||||||
},
|
|
||||||
{
|
|
||||||
url: "https://www.ebay.com/itm/355014752155?_trkparms=amclksrc%3DITM%26aid%3D1110006%26algo%3DHOMESPLICE.SIM%26ao%3D1%26asc%3D20201210111314%26meid%3Db9d7bfc448e846fd88d6af6196122543%26pid%3D101195%26rk%3D5%26rkt%3D12%26sd%3D125187190152%26itm%3D355014752155%26pmt%3D1%26noa%3D0%26pg%3D4429486%26algv%3DSimplAMLv11WebTrimmedV3MskuWithLambda85KnnRecallV1V2V4ItemNrtInQueryAndCassiniVisualRankerAndBertRecallWithVMEV3CPCAuto%26brand%3DMiller&_trksid=p4429486.c101195.m1851&amdata=cksum%3A355014752155b9d7bfc448e846fd88d6af6196122543%7Cenc%3AAQAIAAABUObhgc4Nk8%252BdtAwOww4FKLaj%252FQ5qqgDlQCuqZA43WcPFUWDERCUugbbOk7XQv0JXlBfqCg2xKF3WcPghxGMFw2oSlXvfExEaMYr7I7LmrHcP6czY1wIMt0ORyKiCWt95xldincyyBx3g%252BNDW%252B%252FhWUgTaBhK6xAm%252BJIbCOMehu%252Bdw7Cl7%252B5IYh7smXk3oe11K772Gk2jRH3EKtZgP6B%252FlgnbOdlzXvdfx9nm%252BOFv14nym91rSP%252Fp0wbIOb9ayjgcJ%252BFrPBZFmP28lX44UnMF2tb1luPAriUk40GUO3lqhKbBiRBHaRdiQQMcQYqGH0PMIMw9ARpndx%252BhzDgl11zXK577uYvKJmCTZG%252BJsYG0kBH8jTJWhtdTz3Z7HEvndOTAx0XNofblr0%252FSfGh1VnTJs5jXxD1%252Fn86pkxTf7HyqpXKsaDdR64EbDneXYdEMMx2UixQ%253D%253D%7Campid%3APL_CLK%7Cclp%3A4429486&epid=722188521",
|
|
||||||
prefix: "US ",
|
|
||||||
idName: ".ux-textspans",
|
|
||||||
instanceCount: 14
|
|
||||||
}
|
|
||||||
|
|
||||||
]
|
|
||||||
|
|
||||||
let ENV = {};
|
let ENV = {};
|
||||||
if(__ENV.scenario){
|
if(__ENV.scenario){
|
||||||
|
|||||||
32
Tests/scrapeDataModule.js
Normal file
32
Tests/scrapeDataModule.js
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
/**
 * Scrape targets shared with the k6 test script.
 *
 * Each entry describes one product page and how to locate its price:
 *   - url:           page to fetch.
 *   - prefix:        text that precedes the price inside the matched element
 *                    (e.g. "US "); empty when the element holds only the price.
 *   - idName:        CSS selector for the candidate elements.
 *   - instanceCount: index of the matching element that holds the price.
 *                    NOTE(review): the /getPrice handler in server.js appears to
 *                    treat 0 as "use the text of the last match" rather than
 *                    "first match" - confirm against that handler before
 *                    relying on it.
 *
 * The tracking parameters in the eBay URLs are kept as captured; only the item
 * id in the path identifies the listing.
 */
export const scrapeDataMod = [
    {
        url: "https://www.ebay.com/itm/154843103473?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D154843103473%26pmt%3D1%26noa%3D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DApple&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca4718a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
        prefix: "US ",
        idName: ".ux-textspans",
        instanceCount: 36,
    },
    {
        url: "https://www.ebay.com/itm/254429385780?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D254429385780%26pmt%3D1%26noa%3D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DApple&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca4718a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
        prefix: "US ",
        idName: ".ux-textspans",
        instanceCount: 39,
    },
    {
        // The captured URL had stray "%20%20%20%20" runs pasted into the asc, noa
        // and parentrq values (one of them producing the invalid escape "%%20").
        // They are removed here so the values match the sibling entries above.
        url: "https://www.ebay.com/itm/125187190152?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D125187190152%26pmt%3D0%26noa%3D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DMiller&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca4718a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
        prefix: "US ",
        idName: ".ux-textspans",
        instanceCount: 14,
    },
    {
        url: "https://www.ebay.com/itm/355014752155?_trkparms=amclksrc%3DITM%26aid%3D1110006%26algo%3DHOMESPLICE.SIM%26ao%3D1%26asc%3D20201210111314%26meid%3Db9d7bfc448e846fd88d6af6196122543%26pid%3D101195%26rk%3D5%26rkt%3D12%26sd%3D125187190152%26itm%3D355014752155%26pmt%3D1%26noa%3D0%26pg%3D4429486%26algv%3DSimplAMLv11WebTrimmedV3MskuWithLambda85KnnRecallV1V2V4ItemNrtInQueryAndCassiniVisualRankerAndBertRecallWithVMEV3CPCAuto%26brand%3DMiller&_trksid=p4429486.c101195.m1851&amdata=cksum%3A355014752155b9d7bfc448e846fd88d6af6196122543%7Cenc%3AAQAIAAABUObhgc4Nk8%252BdtAwOww4FKLaj%252FQ5qqgDlQCuqZA43WcPFUWDERCUugbbOk7XQv0JXlBfqCg2xKF3WcPghxGMFw2oSlXvfExEaMYr7I7LmrHcP6czY1wIMt0ORyKiCWt95xldincyyBx3g%252BNDW%252B%252FhWUgTaBhK6xAm%252BJIbCOMehu%252Bdw7Cl7%252B5IYh7smXk3oe11K772Gk2jRH3EKtZgP6B%252FlgnbOdlzXvdfx9nm%252BOFv14nym91rSP%252Fp0wbIOb9ayjgcJ%252BFrPBZFmP28lX44UnMF2tb1luPAriUk40GUO3lqhKbBiRBHaRdiQQMcQYqGH0PMIMw9ARpndx%252BhzDgl11zXK577uYvKJmCTZG%252BJsYG0kBH8jTJWhtdTz3Z7HEvndOTAx0XNofblr0%252FSfGh1VnTJs5jXxD1%252Fn86pkxTf7HyqpXKsaDdR64EbDneXYdEMMx2UixQ%253D%253D%7Campid%3APL_CLK%7Cclp%3A4429486&epid=722188521",
        prefix: "US ",
        idName: ".ux-textspans",
        instanceCount: 14,
    },
    {
        url: "https://www.amazon.com/Schiit-Hybrid-Headphone-Preamp-Black/dp/B0BC6P1WF6/?_encoding=UTF8&pd_rd_w=eJxP2&content-id=amzn1.sym.455a8027-f810-447e-9547-49291aea9c0f&pf_rd_p=455a8027-f810-447e-9547-49291aea9c0f&pf_rd_r=6YB6M8XWKWQ25NWQCXWB&pd_rd_wg=wqc1f&pd_rd_r=21224218-cc39-4619-93db-85797ee56eac&ref_=pd_gw_bmx_gp_rc4cdj8o",
        prefix: "",
        idName: ".a-offscreen",
        instanceCount: 0,
    },
];
|
||||||
BIN
eng.traineddata
Normal file
BIN
eng.traineddata
Normal file
Binary file not shown.
2357
package-lock.json
generated
2357
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -7,7 +7,10 @@
|
|||||||
"mongo-sanitize": "^1.1.0",
|
"mongo-sanitize": "^1.1.0",
|
||||||
"mongodb": "^6.1.0",
|
"mongodb": "^6.1.0",
|
||||||
"node-cron": "^3.0.2",
|
"node-cron": "^3.0.2",
|
||||||
|
"node-tesseract-ocr": "^2.2.1",
|
||||||
"pug": "^3.0.2",
|
"pug": "^3.0.2",
|
||||||
"request": "^2.88.2"
|
"puppeteer": "^21.3.6",
|
||||||
|
"request": "^2.88.2",
|
||||||
|
"tesseract.js": "^5.0.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
BIN
savedImages/amazonTest.png
Normal file
BIN
savedImages/amazonTest.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.1 MiB |
BIN
savedImages/test.png
Normal file
BIN
savedImages/test.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 70 KiB |
133
server.js
133
server.js
@@ -15,6 +15,8 @@ var https = require('https');
|
|||||||
var MongoClient = require('mongodb').MongoClient;
|
var MongoClient = require('mongodb').MongoClient;
|
||||||
var sanitize = require('mongo-sanitize');
|
var sanitize = require('mongo-sanitize');
|
||||||
var cheerio = require("cheerio"); //jQuery Substitute
|
var cheerio = require("cheerio"); //jQuery Substitute
|
||||||
|
var puppeteer = require("puppeteer");
|
||||||
|
const { createWorker } = require("tesseract.js");
|
||||||
|
|
||||||
|
|
||||||
const pug = require('pug');
|
const pug = require('pug');
|
||||||
@@ -26,11 +28,109 @@ app.set('view engine', "pug")
|
|||||||
app.use("/images", express.static(path.join(__dirname, '/images')));
|
app.use("/images", express.static(path.join(__dirname, '/images')));
|
||||||
app.use("/static", express.static(path.join(__dirname, "/static")));
|
app.use("/static", express.static(path.join(__dirname, "/static")));
|
||||||
|
|
||||||
|
// Directory that page screenshots are written to and later read back from for OCR.
// NOTE: this is a relative path, so it resolves against the process working
// directory rather than this file's location.
const baseFilePath = "./savedImages";
|
||||||
|
|
||||||
app.get('/', function (req, res) {
|
app.get('/', function (req, res) {
|
||||||
res.send('Hello World');
|
res.send('Hello World');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/**
 * GET /takeScreenshot?url=<page>&name=<file>
 *
 * Renders `url` in headless Chromium (1280x2000 viewport) and saves the
 * screenshot as `<baseFilePath>/<name>.png`.
 *
 * Responses:
 *   200 "204 no content" - screenshot written (body kept for existing callers).
 *   400                  - `url` is not an http(s) URL, or `name` is missing or
 *                          contains a path separator.
 *   500                  - the browser failed to launch, load the page or save
 *                          the image.
 *
 * The response is only sent once the file exists, so a caller can safely
 * request OCR on the image right after this call returns.
 */
app.get("/takeScreenshot", async function(req, res){
    const url = req.query["url"];
    const name = req.query["name"];

    // Only fetch web pages: page.goto() would otherwise happily open file:// URLs.
    let parsedUrl = null;
    try{
        parsedUrl = new URL(String(url));
    }catch(err){
        parsedUrl = null;
    }
    if(typeof url !== "string" || parsedUrl === null || !["http:", "https:"].includes(parsedUrl.protocol)){
        res.status(400).send("Query parameter 'url' must be an http(s) URL");
        return;
    }

    // `name` becomes part of a file path; refuse anything that could leave baseFilePath.
    if(typeof name !== "string" || name === "" || /[\\/\0]/.test(name)){
        res.status(400).send("Query parameter 'name' must be a plain file name");
        return;
    }

    let browser;
    try{
        browser = await puppeteer.launch({
            headless: "new",
            defaultViewport: {
                width: 1280,
                height: 2000,
            },
        });
        const page = await browser.newPage();
        await page.goto(url);
        await page.screenshot({ path: `${baseFilePath}/${name}.png` });
        res.send("204 no content");
    }catch(err){
        console.error(`Taking screenshot of ${url} failed:`, err);
        if(!res.headersSent){
            res.status(500).send("Failed to take screenshot");
        }
    }finally{
        // Always release the browser process, even when navigation or capture failed.
        if(browser){
            await browser.close().catch((closeErr) => console.error("Failed to close browser:", closeErr));
        }
    }
});
|
||||||
|
|
||||||
|
/**
 * GET /getValueFromImage?name=<file>&left=<px>&top=<px>&width=<px>&height=<px>
 *
 * Runs English OCR (tesseract.js) over one rectangle of the previously saved
 * image `<baseFilePath>/<name>.png` and responds with the recognized text.
 *
 * Responses:
 *   200 <text> - recognized text for the rectangle.
 *   400        - `name` is missing or contains a path separator, or the rectangle
 *                is not made of non-negative integers with a positive
 *                width/height.
 *   404        - no saved image with that name.
 *   500        - OCR failed.
 */
app.get("/getValueFromImage", async function(req, res){
    console.log("Pulling info from image");
    const name = req.query["name"];

    // `name` becomes part of a file path; refuse anything that could leave baseFilePath.
    if(typeof name !== "string" || name === "" || /[\\/\0]/.test(name)){
        res.status(400).send("Query parameter 'name' must be a plain file name");
        return;
    }

    // Query values arrive as strings (or undefined); the OCR rectangle needs pixel numbers.
    const rectangle = {
        left: Number(req.query["left"]),     // e.g. 596
        top: Number(req.query["top"]),       // e.g. 374
        width: Number(req.query["width"]),   // e.g. 75
        height: Number(req.query["height"]), // e.g. 30
    };
    const rectangleIsValid =
        Object.values(rectangle).every((value) => Number.isInteger(value) && value >= 0) &&
        rectangle.width > 0 &&
        rectangle.height > 0;
    if(!rectangleIsValid){
        res.status(400).send("Query parameters 'left', 'top', 'width' and 'height' must be non-negative integers (width/height > 0)");
        return;
    }

    let worker;
    try{
        const img = await fs.promises.readFile(`${baseFilePath}/${name}.png`);
        worker = await createWorker("eng");
        const {
            data: { text },
        } = await worker.recognize(img, { rectangle });
        console.log(text);
        res.send(text);
    }catch(err){
        if(err && err.code === "ENOENT"){
            res.status(404).send(`No saved image named '${name}'`);
        }else{
            console.error(`OCR on image '${name}' failed:`, err);
            if(!res.headersSent){
                res.status(500).send("Failed to read text from image");
            }
        }
    }finally{
        // Always shut the OCR worker down, even when recognition failed.
        if(worker){
            await worker.terminate().catch((terminateErr) => console.error("Failed to terminate OCR worker:", terminateErr));
        }
    }
});
|
||||||
|
|
||||||
|
/**
 * GET /getVisualPrice?url=<page>&name=<file>&left=<px>&top=<px>&width=<px>&height=<px>
 *
 * Screenshots `url` in headless Chromium (1280x2000 viewport), saves it as
 * `<baseFilePath>/<name>.png`, then runs English OCR (tesseract.js) over the
 * given rectangle of that image and responds with the recognized text.
 *
 * Responses:
 *   200 <text> - recognized text for the rectangle.
 *   400        - `url` is not an http(s) URL, `name` is missing or contains a
 *                path separator, or the rectangle is not made of non-negative
 *                integers with a positive width/height.
 *   500        - the page could not be captured or OCR failed.
 */
app.get("/getVisualPrice", async function(req, res){
    console.log("Getting Visual Price");
    const url = req.query["url"];
    const name = req.query["name"];

    // Only fetch web pages: page.goto() would otherwise happily open file:// URLs.
    let parsedUrl = null;
    try{
        parsedUrl = new URL(String(url));
    }catch(err){
        parsedUrl = null;
    }
    if(typeof url !== "string" || parsedUrl === null || !["http:", "https:"].includes(parsedUrl.protocol)){
        res.status(400).send("Query parameter 'url' must be an http(s) URL");
        return;
    }

    // `name` becomes part of a file path; refuse anything that could leave baseFilePath.
    if(typeof name !== "string" || name === "" || /[\\/\0]/.test(name)){
        res.status(400).send("Query parameter 'name' must be a plain file name");
        return;
    }

    // Query values arrive as strings (or undefined); the OCR rectangle needs pixel numbers.
    const rectangle = {
        left: Number(req.query["left"]),     // e.g. 596
        top: Number(req.query["top"]),       // e.g. 374
        width: Number(req.query["width"]),   // e.g. 75
        height: Number(req.query["height"]), // e.g. 30
    };
    const rectangleIsValid =
        Object.values(rectangle).every((value) => Number.isInteger(value) && value >= 0) &&
        rectangle.width > 0 &&
        rectangle.height > 0;
    if(!rectangleIsValid){
        res.status(400).send("Query parameters 'left', 'top', 'width' and 'height' must be non-negative integers (width/height > 0)");
        return;
    }

    const imagePath = `${baseFilePath}/${name}.png`;
    let browser;
    let worker;
    try{
        browser = await puppeteer.launch({
            headless: "new",
            defaultViewport: {
                width: 1280,
                height: 2000,
            },
        });
        const page = await browser.newPage();
        await page.goto(url);
        await page.screenshot({ path: imagePath });
        // The browser is no longer needed once the image is on disk; free it before OCR.
        await browser.close();
        browser = undefined;

        console.log("Reading Image");
        const img = await fs.promises.readFile(imagePath);

        worker = await createWorker("eng");
        console.log("Finding text");
        const {
            data: { text },
        } = await worker.recognize(img, { rectangle });
        console.log(text);
        res.send(text);
    }catch(err){
        console.error(`Getting visual price from ${url} failed:`, err);
        if(!res.headersSent){
            res.status(500).send("Failed to get visual price");
        }
    }finally{
        // Release whichever resources are still open, even on failure.
        if(browser){
            await browser.close().catch((closeErr) => console.error("Failed to close browser:", closeErr));
        }
        if(worker){
            await worker.terminate().catch((terminateErr) => console.error("Failed to terminate OCR worker:", terminateErr));
        }
    }
});
|
||||||
|
|
||||||
app.get("/getPrice", async function(req, res){
|
app.get("/getPrice", async function(req, res){
|
||||||
try{
|
try{
|
||||||
console.log(`Getting Price!`);
|
console.log(`Getting Price!`);
|
||||||
@@ -39,24 +139,39 @@ app.get("/getPrice", async function(req, res){
|
|||||||
let prefix = req.query["prefix"];
|
let prefix = req.query["prefix"];
|
||||||
let instanceCount = parseInt(req.query["instanceCount"]);
|
let instanceCount = parseInt(req.query["instanceCount"]);
|
||||||
let idName = req.query["idName"];
|
let idName = req.query["idName"];
|
||||||
let finalAnswer = "";
|
|
||||||
|
|
||||||
console.log(`Get Price Info:\n
|
console.log(`Get Price Info:\n
|
||||||
Prefix: ${prefix}\n
|
Prefix: ${prefix}\n
|
||||||
InstanceCount: ${instanceCount}\n
|
InstanceCount: ${instanceCount}\n
|
||||||
idName: ${idName}\n
|
idName: ${idName}\n
|
||||||
URL: ${url}\n
|
URL: ${url}\n
|
||||||
`);
|
`);
|
||||||
|
|
||||||
request(url, (err, resp, html) => {
|
request(url, (err, resp, html) => {
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
finalAnswer = "";
|
console.log(html);
|
||||||
$(idName).each((i, el) => {
|
let finalAnswer = "";
|
||||||
if(i == instanceCount){
|
if(instanceCount == 0){
|
||||||
const item = $(el).text();
|
console.log("Getting single instance");
|
||||||
finalAnswer = item.split(prefix).reverse()[0];
|
let item = {};
|
||||||
}
|
console.log($(idName));
|
||||||
});
|
|
||||||
|
$(idName).each((i, el) => {
|
||||||
|
//console.log("item");
|
||||||
|
//console.log(el);
|
||||||
|
item = $(el).text();
|
||||||
|
});
|
||||||
|
console.log(JSON.stringify(item));
|
||||||
|
finalAnswer = item;
|
||||||
|
}else{
|
||||||
|
$(idName).each((i, el) => {
|
||||||
|
if(i == instanceCount){
|
||||||
|
const item = $(el).text();
|
||||||
|
finalAnswer = item.split(prefix).reverse()[0];
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
console.log(`Answer: ${finalAnswer}`);
|
console.log(`Answer: ${finalAnswer}`);
|
||||||
res.send(`Answer: ${finalAnswer}`);
|
res.send(`Answer: ${finalAnswer}`);
|
||||||
res.end();
|
res.end();
|
||||||
|
|||||||
Reference in New Issue
Block a user