// ocr and click: screenshot -> OCR -> locate text -> click
const { mouse, screen, Region, Point, Button, getActiveWindow } = require("@nut-tree/nut-js");
const { execFileSync, execSync } = require("child_process");
const fs = require("fs");
const os = require("os");
const path = require("path");
const zlib = require("zlib");
// Path of the OCR executable, chosen per platform (ocr-<platform>/ocr[.exe])
const OCR_DIR = path.resolve(__dirname, `ocr-${process.platform}`);
const OCR_BIN = path.join(OCR_DIR, process.platform === "win32" ? "ocr.exe" : "ocr");
// Display scale factor (2 on Retina); auto-detected by detectScale() before use
let scaleFactor = 2;
/**
 * Detect the display scale factor (e.g. 2 on Retina) by comparing the pixel
 * width of a full-screen grab against the logical screen width, and store it
 * in the module-level `scaleFactor`.
 */
async function detectScale() {
  const [capture, logicalWidth] = await Promise.all([screen.grab(), screen.width()]);
  scaleFactor = capture.width / logicalWidth;
}
/**
 * Grab the full screen and save it as a temporary PNG file.
 * nut-js screen.grab() returns a raw Image object, so the PNG is written
 * manually via writePNG.
 * @returns {Promise<string>} path of the saved screenshot
 */
async function captureScreen() {
  const shot = await screen.grab();
  const outPath = path.join(os.tmpdir(), `screen_${Date.now()}.png`);
  writePNG(outPath, shot.width, shot.height, shot.data);
  return outPath;
}
/**
 * Grab a rectangular screen region and save it as a temporary PNG file.
 * @param {number} x left edge
 * @param {number} y top edge
 * @param {number} w region width
 * @param {number} h region height
 * @returns {Promise<string>} path of the saved screenshot
 */
async function captureRegion(x, y, w, h) {
  const shot = await screen.grabRegion(new Region(x, y, w, h));
  const outPath = path.join(os.tmpdir(), `region_${Date.now()}.png`);
  writePNG(outPath, shot.width, shot.height, shot.data);
  return outPath;
}
/**
 * Grab the currently active window and save it as a temporary PNG file.
 * @returns {Promise<{ imagePath: string, region: { left: number, top: number, width: number, height: number } }>}
 *   screenshot path plus the window's region (as reported by nut-js)
 */
async function captureActiveWindow() {
  const win = await getActiveWindow();
  const region = await win.region;
  const shot = await screen.grabRegion(
    new Region(region.left, region.top, region.width, region.height)
  );
  const outPath = path.join(os.tmpdir(), `window_${Date.now()}.png`);
  writePNG(outPath, shot.width, shot.height, shot.data);
  return { imagePath: outPath, region };
}
/**
 * Write raw RGBA pixel data to a PNG file (minimal hand-rolled encoder;
 * only depends on zlib).
 *
 * Output layout: 8-byte PNG signature, IHDR (8-bit depth, color type 6 =
 * RGBA, no interlace), a single IDAT containing all scanlines (filter
 * type 0 = None on every row), then IEND.
 *
 * NOTE(review): some nut-js versions return BGRA-ordered pixel data from
 * screen.grab(); if saved colors look red/blue-swapped, swap channels before
 * calling this — TODO confirm against the installed nut-js version.
 *
 * @param {string} filePath destination file path
 * @param {number} width image width in pixels
 * @param {number} height image height in pixels
 * @param {Buffer|Uint8Array} rgbaData raw pixels, 4 bytes per pixel, row-major
 * @throws {RangeError} if rgbaData holds fewer than width * height * 4 bytes
 */
function writePNG(filePath, width, height, rgbaData) {
  // Accept any Uint8Array, not just Buffer (Buffer#copy is used below).
  const pixels = Buffer.isBuffer(rgbaData) ? rgbaData : Buffer.from(rgbaData);
  const bytesPerRow = width * 4;
  if (pixels.length < bytesPerRow * height) {
    throw new RangeError(
      `writePNG: expected ${bytesPerRow * height} bytes of pixel data, got ${pixels.length}`
    );
  }
  // PNG signature
  const signature = Buffer.from([137, 80, 78, 71, 13, 10, 26, 10]);
  // IHDR payload: width, height, then five 1-byte fields
  const ihdr = Buffer.alloc(13);
  ihdr.writeUInt32BE(width, 0);
  ihdr.writeUInt32BE(height, 4);
  ihdr[8] = 8; // bit depth
  ihdr[9] = 6; // color type: RGBA
  ihdr[10] = 0; // compression: deflate
  ihdr[11] = 0; // filter method 0
  ihdr[12] = 0; // no interlace
  // Each scanline is prefixed with one filter-type byte (0 = None).
  const rowSize = bytesPerRow + 1;
  // Buffer.alloc zero-fills, so every row's filter byte is already 0.
  const rawData = Buffer.alloc(rowSize * height);
  for (let y = 0; y < height; y++) {
    pixels.copy(rawData, y * rowSize + 1, y * bytesPerRow, (y + 1) * bytesPerRow);
  }
  const compressed = zlib.deflateSync(rawData);
  fs.writeFileSync(
    filePath,
    Buffer.concat([
      signature,
      makeChunk("IHDR", ihdr),
      makeChunk("IDAT", compressed),
      makeChunk("IEND", Buffer.alloc(0)),
    ])
  );
}
/**
 * Assemble one PNG chunk: 4-byte big-endian payload length, 4-byte ASCII
 * type, payload, then CRC-32 computed over type + payload (per the PNG spec).
 *
 * @param {string} type 4-character chunk type (e.g. "IHDR")
 * @param {Buffer} data chunk payload (may be empty)
 * @returns {Buffer} the complete encoded chunk
 */
function makeChunk(type, data) {
  const typeBuffer = Buffer.from(type, "ascii");
  const length = Buffer.alloc(4);
  length.writeUInt32BE(data.length, 0);
  const crcBuffer = Buffer.alloc(4);
  crcBuffer.writeUInt32BE(crc32(Buffer.concat([typeBuffer, data])), 0);
  return Buffer.concat([length, typeBuffer, data, crcBuffer]);
}
/**
 * CRC-32 (reflected, polynomial 0xEDB88320) as required by PNG chunks.
 * The 256-entry lookup table is built once and memoized on the function
 * object itself.
 *
 * @param {Buffer|Uint8Array} buf bytes to checksum
 * @returns {number} unsigned 32-bit CRC
 */
function crc32(buf) {
  let table = crc32.table;
  if (!table) {
    table = crc32.table = new Uint32Array(256);
    for (let i = 0; i < 256; i++) {
      let c = i;
      for (let j = 0; j < 8; j++) {
        c = c & 1 ? 0xedb88320 ^ (c >>> 1) : c >>> 1;
      }
      table[i] = c;
    }
  }
  let crc = 0xffffffff;
  for (let i = 0; i < buf.length; i++) {
    crc = table[(crc ^ buf[i]) & 0xff] ^ (crc >>> 8);
  }
  return (crc ^ 0xffffffff) >>> 0;
}
/**
 * Run the bundled OCR binary on an image and parse its JSON output.
 *
 * Uses execFileSync with an argument vector instead of a shell command
 * string, so paths containing spaces, quotes, or shell metacharacters
 * cannot break the invocation or inject shell commands (runOCR is part of
 * the exported API, so imagePath may come from outside this module).
 *
 * @param {string} imagePath path of the image to recognize
 * @returns {Array<{text: string, box: number[][]}>} recognized text regions
 * @throws {Error} if the binary exits non-zero, times out (30 s), or emits invalid JSON
 */
function runOCR(imagePath) {
  const output = execFileSync(OCR_BIN, [imagePath], {
    encoding: "utf-8",
    timeout: 30000,
  });
  return JSON.parse(output.trim());
}
/**
 * Compute the logical-pixel center of an OCR bounding box.
 *
 * Box format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] — the four corner points
 * (any order), in screenshot pixel coordinates. The center of the axis-aligned
 * bounding rectangle is divided by `scale` to convert to logical coordinates.
 *
 * @param {number[][]} box four [x, y] corner points
 * @param {number} [scale=scaleFactor] pixel -> logical divisor; defaults to
 *   the module-level scaleFactor detected by detectScale()
 * @returns {{x: number, y: number}} rounded logical coordinates of the center
 */
function getBoxCenter(box, scale = scaleFactor) {
  const xs = box.map((p) => p[0]);
  const ys = box.map((p) => p[1]);
  const cx = (Math.min(...xs) + Math.max(...xs)) / 2;
  const cy = (Math.min(...ys) + Math.max(...ys)) / 2;
  return { x: Math.round(cx / scale), y: Math.round(cy / scale) };
}
/**
 * Capture the screen, OCR it, locate the target text, and click its center.
 *
 * @param {string} targetText the text to find and click
 * @param {object} options optional settings
 * @param {boolean} options.fuzzy substring match when true; default false (exact match)
 * @param {boolean} options.doubleClick double-click instead of single click; default false
 * @param {boolean} options.activeWindow capture only the active window; default false (full screen)
 * @returns {Promise<object>} { success, clickedAt, matchedText }
 */
async function findAndClick(targetText, options = {}) {
  const { fuzzy = false, doubleClick = false, activeWindow = false } = options;
  // Refresh the display scale factor before converting OCR pixel coordinates.
  await detectScale();
  console.log(`[1/3] 截屏中...`);
  let imagePath;
  let offsetX = 0, offsetY = 0;
  if (activeWindow) {
    const { imagePath: wp, region } = await captureActiveWindow();
    imagePath = wp;
    // Remember the window origin so OCR coordinates (relative to the window
    // screenshot) can be translated back to screen coordinates.
    offsetX = region.left;
    offsetY = region.top;
    console.log(` 窗口区域: (${region.left}, ${region.top}, ${region.width}x${region.height})`);
  } else {
    imagePath = await captureScreen();
  }
  console.log(` 截图已保存: ${imagePath}`);
  console.log(`[2/3] OCR 识别中...`);
  const results = runOCR(imagePath);
  console.log(` 识别到 ${results.length} 个文字区域:`);
  results.forEach((r) => console.log(` - "${r.text}"`));
  console.log(`[3/3] 查找文字 "${targetText}"(${fuzzy ? "模糊" : "精确"}匹配)...`);
  const match = results.find((r) =>
    fuzzy ? r.text.includes(targetText) : r.text === targetText
  );
  // Remove the temporary screenshot (best effort; ignore failures).
  try {
    fs.unlinkSync(imagePath);
  } catch {}
  if (!match) {
    console.log(` 未找到文字 "${targetText}"`);
    return { success: false, matchedText: null, clickedAt: null };
  }
  // OCR box coordinates are screenshot pixel coordinates; getBoxCenter already
  // divides by scaleFactor to produce logical coordinates. The window offsets
  // offsetX/offsetY are presumably logical coordinates too (they come from
  // win.region), so they are added directly — TODO confirm on HiDPI /
  // multi-monitor setups.
  const boxCenter = getBoxCenter(match.box);
  const center = {
    x: boxCenter.x + offsetX,
    y: boxCenter.y + offsetY,
  };
  console.log(` 找到 "${match.text}",屏幕坐标: (${center.x}, ${center.y})`);
  await mouse.setPosition(new Point(center.x, center.y));
  if (doubleClick) {
    await mouse.doubleClick(Button.LEFT);
  } else {
    await mouse.click(Button.LEFT);
  }
  console.log(` 已${doubleClick ? "双击" : "点击"} (${center.x}, ${center.y})`);
  return { success: true, matchedText: match.text, clickedAt: center };
}
/**
 * Capture and OCR only — no clicking.
 * @param {object} options
 * @param {boolean} options.activeWindow capture only the active window
 * @returns {Promise<Array<{text: string, box: number[][], center: {x: number, y: number}}>>}
 *   OCR results with each box's center in logical screen coordinates
 */
async function ocrScreen(options = {}) {
  const { activeWindow = false } = options;
  await detectScale();
  const imagePath = activeWindow
    ? (await captureActiveWindow()).imagePath
    : await captureScreen();
  const recognized = runOCR(imagePath);
  // Best-effort removal of the temporary screenshot; ignore failures.
  try {
    fs.unlinkSync(imagePath);
  } catch {}
  return recognized.map((entry) => ({
    ...entry,
    center: getBoxCenter(entry.box),
  }));
}
// Public API
module.exports = {
  captureScreen,
  captureRegion,
  captureActiveWindow,
  runOCR,
  findAndClick,
  ocrScreen,
};
// CLI entry point: node index.js "text to click"
if (require.main === module) {
  const target = process.argv[2];
  if (!target) {
    // No argument: print usage (in Chinese, matching the tool's UI language) and exit non-zero.
    console.log("用法: node index.js <要查找并点击的文字> [--fuzzy] [--double] [--window]");
    console.log("");
    console.log("示例:");
    console.log(' node index.js "文件" # 精确匹配,点击文字为"文件"的区域');
    console.log(' node index.js "文件" --fuzzy # 模糊匹配,点击包含"文件"的区域');
    console.log(' node index.js "文件" --window # 仅在当前激活窗口中搜索');
    console.log(' node index.js "文件" --double # 双击');
    console.log("");
    console.log("仅 OCR 识别(不点击):");
    console.log(" node index.js --scan");
    console.log(" node index.js --scan --window # 仅扫描当前窗口");
    process.exit(1);
  }
  if (target === "--scan") {
    // Scan-only mode: OCR the screen (or active window) and print results, no clicking.
    const activeWindow = process.argv.includes("--window");
    ocrScreen({ activeWindow })
      .then((results) => {
        console.log(`${activeWindow ? "窗口" : "屏幕"} OCR 识别结果:`);
        results.forEach((r) => {
          console.log(` "${r.text}" -> (${r.center.x}, ${r.center.y})`);
        });
      })
      .catch(console.error);
  } else {
    const fuzzy = process.argv.includes("--fuzzy");
    const doubleClick = process.argv.includes("--double");
    const activeWindow = process.argv.includes("--window");
    findAndClick(target, { fuzzy, doubleClick, activeWindow })
      .then((result) => {
        // Exit non-zero when the text was not found, so shell scripts can branch on it.
        if (!result.success) process.exit(1);
      })
      .catch(console.error);
  }
}