ocr and click

const { mouse, screen, Region, Point, Button, getActiveWindow } = require("@nut-tree/nut-js"); const { execSync } = require("child_process"); const path = require("path"); const fs = require("fs"); const os = require("os"); // OCR 可执行文件路径(按平台选择) const OCR_DIR = path.resolve(__dirname, `ocr-${process.platform}`); const OCR_BIN = path.join(OCR_DIR, process.platform === "win32" ? "ocr.exe" : "ocr"); // 屏幕缩放比例(Retina 为 2),启动时自动检测 let scaleFactor = 2; async function detectScale() { const img = await screen.grab(); const logicalWidth = await screen.width(); scaleFactor = img.width / logicalWidth; } /** * 截取全屏并保存到临时文件 * @returns {string} 截图文件路径 */ async function captureScreen() { const tmpFile = path.join(os.tmpdir(), `screen_${Date.now()}.png`); const image = await screen.grab(); // nut-js screen.grab() 返回的是 Image 对象,需要手动保存为 PNG const { width, height, data } = image; writePNG(tmpFile, width, height, data); return tmpFile; } /** * 截取指定区域并保存 * @param {number} x * @param {number} y * @param {number} w * @param {number} h * @returns {string} 截图文件路径 */ async function captureRegion(x, y, w, h) { const tmpFile = path.join(os.tmpdir(), `region_${Date.now()}.png`); const region = new Region(x, y, w, h); const image = await screen.grabRegion(region); const { width, height, data } = image; writePNG(tmpFile, width, height, data); return tmpFile; } /** * 截取当前激活窗口 * @returns {{ imagePath: string, region: { left: number, top: number, width: number, height: number } }} */ async function captureActiveWindow() { const win = await getActiveWindow(); const region = await win.region; const tmpFile = path.join(os.tmpdir(), `window_${Date.now()}.png`); const image = await screen.grabRegion( new Region(region.left, region.top, region.width, region.height) ); const { width, height, data } = image; writePNG(tmpFile, width, height, data); return { imagePath: tmpFile, region }; } /** * 将 RGBA raw data 写成 PNG 文件(无外部依赖,手写最小 PNG 编码) */ function writePNG(filePath, width, height, rgbaData) { const zlib = require("zlib"); // PNG signature const signature = Buffer.from([137, 80, 78, 71, 13, 10, 26, 10]); // IHDR chunk const ihdr = Buffer.alloc(13); ihdr.writeUInt32BE(width, 0); ihdr.writeUInt32BE(height, 4); ihdr[8] = 8; // bit depth ihdr[9] = 6; // color type: RGBA ihdr[10] = 0; // compression ihdr[11] = 0; // filter ihdr[12] = 0; // interlace // Build raw image data with filter bytes const rowSize = width * 4 + 1; // +1 for filter byte const rawData = Buffer.alloc(rowSize * height); for (let y = 0; y < height; y++) { rawData[y * rowSize] = 0; // filter: None rgbaData.copy(rawData, y * rowSize + 1, y * width * 4, (y + 1) * width * 4); } const compressed = zlib.deflateSync(rawData); const chunks = []; chunks.push(signature); chunks.push(makeChunk("IHDR", ihdr)); chunks.push(makeChunk("IDAT", compressed)); chunks.push(makeChunk("IEND", Buffer.alloc(0))); fs.writeFileSync(filePath, Buffer.concat(chunks)); } function makeChunk(type, data) { const typeBuffer = Buffer.from(type, "ascii"); const length = Buffer.alloc(4); length.writeUInt32BE(data.length, 0); const crcInput = Buffer.concat([typeBuffer, data]); const crc = crc32(crcInput); const crcBuffer = Buffer.alloc(4); crcBuffer.writeUInt32BE(crc >>> 0, 0); return Buffer.concat([length, typeBuffer, data, crcBuffer]); } function crc32(buf) { let table = crc32.table; if (!table) { table = crc32.table = new Uint32Array(256); for (let i = 0; i < 256; i++) { let c = i; for (let j = 0; j < 8; j++) { c = c & 1 ? 0xedb88320 ^ (c >>> 1) : c >>> 1; } table[i] = c; } } let crc = 0xffffffff; for (let i = 0; i < buf.length; i++) { crc = table[(crc ^ buf[i]) & 0xff] ^ (crc >>> 8); } return (crc ^ 0xffffffff) >>> 0; } /** * 调用 OCR 识别图片中的文字 * @param {string} imagePath 图片路径 * @returns {Array<{text: string, box: number[][]}>} OCR 识别结果 */ function runOCR(imagePath) { const output = execSync(`"${OCR_BIN}" "${imagePath}"`, { encoding: "utf-8", timeout: 30000, }); return JSON.parse(output.trim()); } /** * 从 OCR box 坐标计算中心点 * box 格式: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]](四个角的坐标) */ function getBoxCenter(box) { const xs = box.map((p) => p[0]); const ys = box.map((p) => p[1]); const cx = (Math.min(...xs) + Math.max(...xs)) / 2; const cy = (Math.min(...ys) + Math.max(...ys)) / 2; return { x: Math.round(cx / scaleFactor), y: Math.round(cy / scaleFactor) }; } /** * 截屏 -> OCR 识别 -> 查找目标文字 -> 点击 * @param {string} targetText 要查找并点击的文字 * @param {object} options 可选配置 * @param {boolean} options.fuzzy 是否模糊匹配(包含即匹配),默认 false(精确匹配) * @param {boolean} options.doubleClick 是否双击,默认 false * @param {boolean} options.activeWindow 是否只截取当前激活窗口,默认 false(全屏) * @returns {object} { success, clickedAt, matchedText } */ async function findAndClick(targetText, options = {}) { const { fuzzy = false, doubleClick = false, activeWindow = false } = options; await detectScale(); console.log(`[1/3] 截屏中...`); let imagePath; let offsetX = 0, offsetY = 0; if (activeWindow) { const { imagePath: wp, region } = await captureActiveWindow(); imagePath = wp; offsetX = region.left; offsetY = region.top; console.log(` 窗口区域: (${region.left}, ${region.top}, ${region.width}x${region.height})`); } else { imagePath = await captureScreen(); } console.log(` 截图已保存: ${imagePath}`); console.log(`[2/3] OCR 识别中...`); const results = runOCR(imagePath); console.log(` 识别到 ${results.length} 个文字区域:`); results.forEach((r) => console.log(` - "${r.text}"`)); console.log(`[3/3] 查找文字 "${targetText}"(${fuzzy ? "模糊" : "精确"}匹配)...`); const match = results.find((r) => fuzzy ? r.text.includes(targetText) : r.text === targetText ); // 清理临时截图 try { fs.unlinkSync(imagePath); } catch {} if (!match) { console.log(` 未找到文字 "${targetText}"`); return { success: false, matchedText: null, clickedAt: null }; } // OCR 坐标是相对截图的像素坐标,getBoxCenter 已除以 scaleFactor 转为逻辑坐标 // 窗口偏移 offsetX/offsetY 本身就是逻辑坐标,直接相加 const boxCenter = getBoxCenter(match.box); const center = { x: boxCenter.x + offsetX, y: boxCenter.y + offsetY, }; console.log(` 找到 "${match.text}",屏幕坐标: (${center.x}, ${center.y})`); await mouse.setPosition(new Point(center.x, center.y)); if (doubleClick) { await mouse.doubleClick(Button.LEFT); } else { await mouse.click(Button.LEFT); } console.log(` 已${doubleClick ? "双击" : "点击"} (${center.x}, ${center.y})`); return { success: true, matchedText: match.text, clickedAt: center }; } /** * 仅截屏并 OCR,不点击 * @param {object} options * @param {boolean} options.activeWindow 是否只截取当前激活窗口 * @returns {Array<{text: string, box: number[][], center: {x: number, y: number}}>} */ async function ocrScreen(options = {}) { const { activeWindow = false } = options; await detectScale(); let imagePath; if (activeWindow) { ({ imagePath } = await captureActiveWindow()); } else { imagePath = await captureScreen(); } const results = runOCR(imagePath); try { fs.unlinkSync(imagePath); } catch {} return results.map((r) => ({ ...r, center: getBoxCenter(r.box), })); } // 导出 API module.exports = { captureScreen, captureRegion, captureActiveWindow, runOCR, findAndClick, ocrScreen, }; // 命令行直接调用: node index.js "要点击的文字" if (require.main === module) { const target = process.argv[2]; if (!target) { console.log("用法: node index.js <要查找并点击的文字> [--fuzzy] [--double] [--window]"); console.log(""); console.log("示例:"); console.log(' node index.js "文件" # 精确匹配,点击文字为"文件"的区域'); console.log(' node index.js "文件" --fuzzy # 模糊匹配,点击包含"文件"的区域'); console.log(' node index.js "文件" --window # 仅在当前激活窗口中搜索'); console.log(' node index.js "文件" --double # 双击'); console.log(""); console.log("仅 OCR 识别(不点击):"); console.log(" node index.js --scan"); console.log(" node index.js --scan --window # 仅扫描当前窗口"); process.exit(1); } if (target === "--scan") { const activeWindow = process.argv.includes("--window"); ocrScreen({ activeWindow }) .then((results) => { console.log(`${activeWindow ? "窗口" : "屏幕"} OCR 识别结果:`); results.forEach((r) => { console.log(` "${r.text}" -> (${r.center.x}, ${r.center.y})`); }); }) .catch(console.error); } else { const fuzzy = process.argv.includes("--fuzzy"); const doubleClick = process.argv.includes("--double"); const activeWindow = process.argv.includes("--window"); findAndClick(target, { fuzzy, doubleClick, activeWindow }) .then((result) => { if (!result.success) process.exit(1); }) .catch(console.error); } }