Added optical character recognition.

This commit is contained in:
2023-09-28 12:09:35 -05:00
parent 321b8fe50c
commit b09fc5af11
11 changed files with 2521 additions and 40 deletions

2
.gitignore vendored
View File

@@ -1,2 +1,2 @@
*.swp
node_modules node_modules

View File

@@ -12,7 +12,7 @@ export const options = {
let scenarios = { let scenarios = {
base:{ base:{
executor "per-vu-iterations", executor: "per-vu-iterations",
vus: 1, vus: 1,
iterations: 1, iterations: 1,
maxDuration: "5m" maxDuration: "5m"

Binary file not shown.

View File

@@ -4,6 +4,8 @@ import { SharedArray } from "k6/data"
import http from "k6/http" import http from "k6/http"
import { URL } from 'https://jslib.k6.io/url/1.0.0/index.js'; import { URL } from 'https://jslib.k6.io/url/1.0.0/index.js';
import { scrapeDataMod } from "./scrapeDataModule.js";
export const options = { export const options = {
scenarios:{}, scenarios:{},
teardownTimeout: "120s", teardownTimeout: "120s",
@@ -20,33 +22,7 @@ let scenarios = {
} }
const baseURL = "http://localhost:8001"; const baseURL = "http://localhost:8001";
let scrapeData = [ let scrapeData = scrapeDataMod;
{
url: "https://www.ebay.com/itm/154843103473?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D154843103473%26pmt%3D1%26noa%3D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DApple&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca4718a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
prefix:"US ",
idName: ".ux-textspans",
instanceCount: 36,
},
{
url: "https://www.ebay.com/itm/254429385780?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D254429385780%26pmt%3D1%26noa%3D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DApple&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca4718a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
prefix: "US ",
idName: ".ux-textspans",
instanceCount: 39
},
{
url:"https://www.ebay.com/itm/125187190152?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115%20%20%20%20209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D125187190152%26pmt%3D0%26noa%%20%20%20%203D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DMiller&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca471%20%20%20%208a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
prefix: "US ",
idName: ".ux-textspans",
instanceCount: 14
},
{
url: "https://www.ebay.com/itm/355014752155?_trkparms=amclksrc%3DITM%26aid%3D1110006%26algo%3DHOMESPLICE.SIM%26ao%3D1%26asc%3D20201210111314%26meid%3Db9d7bfc448e846fd88d6af6196122543%26pid%3D101195%26rk%3D5%26rkt%3D12%26sd%3D125187190152%26itm%3D355014752155%26pmt%3D1%26noa%3D0%26pg%3D4429486%26algv%3DSimplAMLv11WebTrimmedV3MskuWithLambda85KnnRecallV1V2V4ItemNrtInQueryAndCassiniVisualRankerAndBertRecallWithVMEV3CPCAuto%26brand%3DMiller&_trksid=p4429486.c101195.m1851&amdata=cksum%3A355014752155b9d7bfc448e846fd88d6af6196122543%7Cenc%3AAQAIAAABUObhgc4Nk8%252BdtAwOww4FKLaj%252FQ5qqgDlQCuqZA43WcPFUWDERCUugbbOk7XQv0JXlBfqCg2xKF3WcPghxGMFw2oSlXvfExEaMYr7I7LmrHcP6czY1wIMt0ORyKiCWt95xldincyyBx3g%252BNDW%252B%252FhWUgTaBhK6xAm%252BJIbCOMehu%252Bdw7Cl7%252B5IYh7smXk3oe11K772Gk2jRH3EKtZgP6B%252FlgnbOdlzXvdfx9nm%252BOFv14nym91rSP%252Fp0wbIOb9ayjgcJ%252BFrPBZFmP28lX44UnMF2tb1luPAriUk40GUO3lqhKbBiRBHaRdiQQMcQYqGH0PMIMw9ARpndx%252BhzDgl11zXK577uYvKJmCTZG%252BJsYG0kBH8jTJWhtdTz3Z7HEvndOTAx0XNofblr0%252FSfGh1VnTJs5jXxD1%252Fn86pkxTf7HyqpXKsaDdR64EbDneXYdEMMx2UixQ%253D%253D%7Campid%3APL_CLK%7Cclp%3A4429486&epid=722188521",
prefix: "US ",
idName: ".ux-textspans",
instanceCount: 14
}
]
let ENV = {}; let ENV = {};
if(__ENV.scenario){ if(__ENV.scenario){

32
Tests/scrapeDataModule.js Normal file
View File

@@ -0,0 +1,32 @@
/**
 * Scrape targets shared by the k6 test scripts.
 *
 * Each entry describes one product page:
 *  - url:           page to fetch
 *  - prefix:        text preceding the price inside the matched element
 *                   (the server keeps what follows the last occurrence)
 *  - idName:        CSS selector handed to the server's /getPrice endpoint
 *  - instanceCount: index of the matching element that holds the price;
 *                   0 selects the server's "single instance" code path
 *
 * NOTE: the third eBay URL previously contained runs of "%20%20%20%20"
 * spliced into the middle of its tracking parameters (line-wrap residue from
 * a paste), which also produced the malformed escape "noa%%20...". The values
 * are restored here to match the sibling URLs.
 */
export const scrapeDataMod = [
  {
    url: "https://www.ebay.com/itm/154843103473?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D154843103473%26pmt%3D1%26noa%3D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DApple&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca4718a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
    prefix: "US ",
    idName: ".ux-textspans",
    instanceCount: 36,
  },
  {
    url: "https://www.ebay.com/itm/254429385780?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D254429385780%26pmt%3D1%26noa%3D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DApple&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca4718a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
    prefix: "US ",
    idName: ".ux-textspans",
    instanceCount: 39,
  },
  {
    url: "https://www.ebay.com/itm/125187190152?_trkparms=amclksrc%3DITM%26aid%3D777008%26algo%3DPERSONAL.TOPIC%26ao%3D1%26asc%3D20230823115209%26meid%3D547ade272f0245a3a38d3f775c940b40%26pid%3D101800%26rk%3D1%26rkt%3D1%26sd%3D394822890601%26itm%3D125187190152%26pmt%3D0%26noa%3D1%26pg%3D4375194%26algv%3DRecentlyViewedItemsV2SignedOut%26brand%3DMiller&_trksid=p4375194.c101800.m5481&_trkparms=parentrq%3Ad7d5ca4718a0ab4c13690428fffff6d6%7Cpageci%3A22c4128a-5d61-11ee-9d02-ee1c8ae0bfdf%7Ciid%3A1%7Cvlpname%3Avlp_homepage",
    prefix: "US ",
    idName: ".ux-textspans",
    instanceCount: 14,
  },
  {
    url: "https://www.ebay.com/itm/355014752155?_trkparms=amclksrc%3DITM%26aid%3D1110006%26algo%3DHOMESPLICE.SIM%26ao%3D1%26asc%3D20201210111314%26meid%3Db9d7bfc448e846fd88d6af6196122543%26pid%3D101195%26rk%3D5%26rkt%3D12%26sd%3D125187190152%26itm%3D355014752155%26pmt%3D1%26noa%3D0%26pg%3D4429486%26algv%3DSimplAMLv11WebTrimmedV3MskuWithLambda85KnnRecallV1V2V4ItemNrtInQueryAndCassiniVisualRankerAndBertRecallWithVMEV3CPCAuto%26brand%3DMiller&_trksid=p4429486.c101195.m1851&amdata=cksum%3A355014752155b9d7bfc448e846fd88d6af6196122543%7Cenc%3AAQAIAAABUObhgc4Nk8%252BdtAwOww4FKLaj%252FQ5qqgDlQCuqZA43WcPFUWDERCUugbbOk7XQv0JXlBfqCg2xKF3WcPghxGMFw2oSlXvfExEaMYr7I7LmrHcP6czY1wIMt0ORyKiCWt95xldincyyBx3g%252BNDW%252B%252FhWUgTaBhK6xAm%252BJIbCOMehu%252Bdw7Cl7%252B5IYh7smXk3oe11K772Gk2jRH3EKtZgP6B%252FlgnbOdlzXvdfx9nm%252BOFv14nym91rSP%252Fp0wbIOb9ayjgcJ%252BFrPBZFmP28lX44UnMF2tb1luPAriUk40GUO3lqhKbBiRBHaRdiQQMcQYqGH0PMIMw9ARpndx%252BhzDgl11zXK577uYvKJmCTZG%252BJsYG0kBH8jTJWhtdTz3Z7HEvndOTAx0XNofblr0%252FSfGh1VnTJs5jXxD1%252Fn86pkxTf7HyqpXKsaDdR64EbDneXYdEMMx2UixQ%253D%253D%7Campid%3APL_CLK%7Cclp%3A4429486&epid=722188521",
    prefix: "US ",
    idName: ".ux-textspans",
    instanceCount: 14,
  },
  {
    url: "https://www.amazon.com/Schiit-Hybrid-Headphone-Preamp-Black/dp/B0BC6P1WF6/?_encoding=UTF8&pd_rd_w=eJxP2&content-id=amzn1.sym.455a8027-f810-447e-9547-49291aea9c0f&pf_rd_p=455a8027-f810-447e-9547-49291aea9c0f&pf_rd_r=6YB6M8XWKWQ25NWQCXWB&pd_rd_wg=wqc1f&pd_rd_r=21224218-cc39-4619-93db-85797ee56eac&ref_=pd_gw_bmx_gp_rc4cdj8o",
    prefix: "",
    idName: ".a-offscreen",
    instanceCount: 0,
  },
];

BIN
eng.traineddata Normal file

Binary file not shown.

2357
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -7,7 +7,10 @@
"mongo-sanitize": "^1.1.0", "mongo-sanitize": "^1.1.0",
"mongodb": "^6.1.0", "mongodb": "^6.1.0",
"node-cron": "^3.0.2", "node-cron": "^3.0.2",
"node-tesseract-ocr": "^2.2.1",
"pug": "^3.0.2", "pug": "^3.0.2",
"request": "^2.88.2" "puppeteer": "^21.3.6",
"request": "^2.88.2",
"tesseract.js": "^5.0.0"
} }
} }

BIN
savedImages/amazonTest.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

BIN
savedImages/test.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 70 KiB

119
server.js
View File

@@ -15,6 +15,8 @@ var https = require('https');
var MongoClient = require('mongodb').MongoClient; var MongoClient = require('mongodb').MongoClient;
var sanitize = require('mongo-sanitize'); var sanitize = require('mongo-sanitize');
var cheerio = require("cheerio"); //jQuery Substitute var cheerio = require("cheerio"); //jQuery Substitute
var puppeteer = require("puppeteer");
const { createWorker } = require("tesseract.js");
const pug = require('pug'); const pug = require('pug');
@@ -26,11 +28,109 @@ app.set('view engine', "pug")
app.use("/images", express.static(path.join(__dirname, '/images'))); app.use("/images", express.static(path.join(__dirname, '/images')));
app.use("/static", express.static(path.join(__dirname, "/static"))); app.use("/static", express.static(path.join(__dirname, "/static")));
const baseFilePath = "./savedImages";
app.get('/', function (req, res) { app.get('/', function (req, res) {
res.send('Hello World'); res.send('Hello World');
}); });
/**
 * GET /takeScreenshot?url=<page url>&name=<file stem>
 *
 * Opens `url` in headless Chromium (1280x2000 viewport) and writes a
 * screenshot to `${baseFilePath}/${name}.png`.
 *
 * The response is sent only after the screenshot has been written, so a
 * client can safely request the image right afterwards. On success the
 * response keeps the historical body "204 no content" (with Express's default
 * 200 status) so existing callers see the same payload.
 *
 * Responses:
 *  - 200 "204 no content"  screenshot written
 *  - 400                   missing url/name, or name is not a plain file name
 *  - 500                   browser launch, navigation or screenshot failed
 */
app.get("/takeScreenshot", async function (req, res) {
  const url = req.query["url"];
  const name = req.query["name"];

  // `name` becomes part of a file path: refuse anything that is not a bare
  // file name so a request cannot write outside of baseFilePath.
  const hasValidUrl = typeof url === "string" && url !== "";
  const hasValidName =
    typeof name === "string" && name !== "" && path.basename(name) === name;
  if (!hasValidUrl || !hasValidName) {
    res.status(400).send("Expected query parameters: url, name (a plain file name)");
    return;
  }

  let browser;
  try {
    browser = await puppeteer.launch({
      headless: "new",
      defaultViewport: {
        width: 1280,
        height: 2000,
      },
    });
    const page = await browser.newPage();
    await page.goto(url);
    await page.screenshot({ path: `${baseFilePath}/${name}.png` });
    res.send("204 no content");
  } catch (err) {
    // Without this, a failed launch/navigation would surface as an unhandled
    // promise rejection and the client would never learn about it.
    console.error(`Failed to take screenshot of ${url}:`, err);
    if (!res.headersSent) {
      res.status(500).send("Failed to take screenshot");
    }
  } finally {
    // Always release the browser process, even when navigation failed.
    if (browser) {
      await browser.close().catch((closeErr) => {
        console.error("Failed to close browser:", closeErr);
      });
    }
  }
});
/**
 * GET /getValueFromImage?name=<file stem>&top=&left=&width=&height=
 *
 * Runs Tesseract OCR (English) on the rectangle described by
 * top/left/width/height inside `${baseFilePath}/${name}.png` and responds
 * with the recognized text.
 *
 * Responses:
 *  - 200 <text>  recognized text
 *  - 400         missing/invalid name or rectangle values
 *  - 404         the image file does not exist
 *  - 500         reading the image or OCR failed
 */
app.get("/getValueFromImage", async function (req, res) {
  console.log("Pulling info from image");
  const name = req.query["name"];

  // Query values arrive as strings; convert them to integers before passing
  // them on as the recognition rectangle.
  const rectangle = {
    left: Number.parseInt(req.query["left"], 10), // e.g. 596
    top: Number.parseInt(req.query["top"], 10), // e.g. 374
    width: Number.parseInt(req.query["width"], 10), // e.g. 75
    height: Number.parseInt(req.query["height"], 10), // e.g. 30
  };

  // `name` becomes part of a file path: only accept a bare file name.
  const hasValidName =
    typeof name === "string" && name !== "" && path.basename(name) === name;
  const hasValidRectangle = Object.values(rectangle).every(
    (value) => Number.isInteger(value) && value >= 0,
  );
  if (!hasValidName || !hasValidRectangle) {
    res
      .status(400)
      .send("Expected query parameters: name (a plain file name), top, left, width, height (non-negative integers)");
    return;
  }

  let worker;
  try {
    const img = await fs.promises.readFile(`${baseFilePath}/${name}.png`);
    worker = await createWorker("eng");
    const {
      data: { text },
    } = await worker.recognize(img, { rectangle });
    res.send(text);
    console.log(text);
  } catch (err) {
    console.error(`Failed to read text from image "${name}":`, err);
    if (!res.headersSent) {
      if (err && err.code === "ENOENT") {
        res.status(404).send("Image not found");
      } else {
        res.status(500).send("Failed to read text from image");
      }
    }
  } finally {
    // Terminate the OCR worker on every path so failed requests do not leak it.
    if (worker) {
      await worker.terminate().catch((terminateErr) => {
        console.error("Failed to terminate OCR worker:", terminateErr);
      });
    }
  }
});
/**
 * GET /getVisualPrice?url=<page url>&name=<file stem>&top=&left=&width=&height=
 *
 * Takes a screenshot of `url` in headless Chromium (1280x2000 viewport),
 * saves it as `${baseFilePath}/${name}.png`, then runs Tesseract OCR
 * (English) on the given rectangle of that image and responds with the
 * recognized text.
 *
 * Responses:
 *  - 200 <text>  recognized text
 *  - 400         missing/invalid url, name or rectangle values
 *  - 500         screenshot or OCR failed
 */
app.get("/getVisualPrice", async function (req, res) {
  console.log("Getting Visual Price");
  const url = req.query["url"];
  const name = req.query["name"];

  // Query values arrive as strings; convert them to integers before passing
  // them on as the recognition rectangle.
  const rectangle = {
    left: Number.parseInt(req.query["left"], 10), // e.g. 596
    top: Number.parseInt(req.query["top"], 10), // e.g. 374
    width: Number.parseInt(req.query["width"], 10), // e.g. 75
    height: Number.parseInt(req.query["height"], 10), // e.g. 30
  };

  // `name` becomes part of a file path: only accept a bare file name.
  const hasValidUrl = typeof url === "string" && url !== "";
  const hasValidName =
    typeof name === "string" && name !== "" && path.basename(name) === name;
  const hasValidRectangle = Object.values(rectangle).every(
    (value) => Number.isInteger(value) && value >= 0,
  );
  if (!hasValidUrl || !hasValidName || !hasValidRectangle) {
    res
      .status(400)
      .send("Expected query parameters: url, name (a plain file name), top, left, width, height (non-negative integers)");
    return;
  }

  const imagePath = `${baseFilePath}/${name}.png`;
  let browser;
  let worker;
  try {
    browser = await puppeteer.launch({
      headless: "new",
      defaultViewport: {
        width: 1280,
        height: 2000,
      },
    });
    const page = await browser.newPage();
    await page.goto(url);
    await page.screenshot({ path: imagePath });
    // Release the browser before OCR starts, as the original flow did.
    await browser.close();
    browser = undefined;

    console.log("Reading Image");
    const img = await fs.promises.readFile(imagePath);
    worker = await createWorker("eng");

    console.log("Finding text");
    const {
      data: { text },
    } = await worker.recognize(img, { rectangle });
    res.send(text);
    console.log(text);
  } catch (err) {
    // Without this, any failure would surface as an unhandled promise
    // rejection and the request would hang without a response.
    console.error(`Failed to get visual price from ${url}:`, err);
    if (!res.headersSent) {
      res.status(500).send("Failed to get visual price");
    }
  } finally {
    // Clean up whatever is still open, regardless of which step failed.
    if (browser) {
      await browser.close().catch((closeErr) => {
        console.error("Failed to close browser:", closeErr);
      });
    }
    if (worker) {
      await worker.terminate().catch((terminateErr) => {
        console.error("Failed to terminate OCR worker:", terminateErr);
      });
    }
  }
});
app.get("/getPrice", async function(req, res){ app.get("/getPrice", async function(req, res){
try{ try{
console.log(`Getting Price!`); console.log(`Getting Price!`);
@@ -39,7 +139,7 @@ app.get("/getPrice", async function(req, res){
let prefix = req.query["prefix"]; let prefix = req.query["prefix"];
let instanceCount = parseInt(req.query["instanceCount"]); let instanceCount = parseInt(req.query["instanceCount"]);
let idName = req.query["idName"]; let idName = req.query["idName"];
let finalAnswer = "";
console.log(`Get Price Info:\n console.log(`Get Price Info:\n
Prefix: ${prefix}\n Prefix: ${prefix}\n
@@ -50,13 +150,28 @@ app.get("/getPrice", async function(req, res){
request(url, (err, resp, html) => { request(url, (err, resp, html) => {
const $ = cheerio.load(html); const $ = cheerio.load(html);
finalAnswer = ""; console.log(html);
let finalAnswer = "";
if(instanceCount == 0){
console.log("Getting single instance");
let item = {};
console.log($(idName));
$(idName).each((i, el) => {
//console.log("item");
//console.log(el);
item = $(el).text();
});
console.log(JSON.stringify(item));
finalAnswer = item;
}else{
$(idName).each((i, el) => { $(idName).each((i, el) => {
if(i == instanceCount){ if(i == instanceCount){
const item = $(el).text(); const item = $(el).text();
finalAnswer = item.split(prefix).reverse()[0]; finalAnswer = item.split(prefix).reverse()[0];
} }
}); });
}
console.log(`Answer: ${finalAnswer}`); console.log(`Answer: ${finalAnswer}`);
res.send(`Answer: ${finalAnswer}`); res.send(`Answer: ${finalAnswer}`);
res.end(); res.end();