ocr and click

4/21/2026

const { mouse, screen, Region, Point, Button, getActiveWindow } = require("@nut-tree/nut-js");
 
const { execSync } = require("child_process");
 
const path = require("path");
 
const fs = require("fs");
 
const os = require("os");
 
 
// Location of the bundled OCR executable. It lives in an "ocr-<platform>"
// directory next to this file and is named "ocr.exe" on Windows, "ocr" elsewhere.
const OCR_EXECUTABLE_NAME = process.platform === "win32" ? "ocr.exe" : "ocr";
const OCR_DIR = path.resolve(__dirname, `ocr-${process.platform}`);
const OCR_BIN = path.join(OCR_DIR, OCR_EXECUTABLE_NAME);

// Ratio between captured-image pixels and logical screen units (2 on Retina
// displays). Starts at 2 and is overwritten by detectScale().
let scaleFactor = 2;
 
/**
 * Measure the display scale and store it in the module-level `scaleFactor`.
 *
 * The scale is the width of a full-screen grab divided by the width that
 * `screen.width()` reports.
 *
 * @returns {Promise<void>}
 */
async function detectScale() {
  const { width: capturedWidth } = await screen.grab();
  const reportedWidth = await screen.width();
  scaleFactor = capturedWidth / reportedWidth;
}
 
  
 
/**
 * Grab the whole screen and save it as a PNG in the OS temp directory.
 *
 * @returns {Promise<string>} Path of the written screenshot file.
 */
async function captureScreen() {
  const tmpFile = path.join(os.tmpdir(), `screen_${Date.now()}.png`);
  const grabbed = await screen.grab();

  // writePNG encodes the buffer as RGBA. Per the nut-js Image API a grab may be
  // in BGR channel order, so normalise it first; toRGB() returns the image
  // unchanged when it is already RGB. The guard keeps older nut-js versions
  // without toRGB() working exactly as before.
  const image = typeof grabbed.toRGB === "function" ? await grabbed.toRGB() : grabbed;

  const { width, height, data } = image;
  writePNG(tmpFile, width, height, data);
  return tmpFile;
}
 
  
 
/**
 * Grab a rectangular screen region and save it as a PNG in the OS temp directory.
 *
 * @param {number} x Left edge of the region.
 * @param {number} y Top edge of the region.
 * @param {number} w Region width.
 * @param {number} h Region height.
 * @returns {Promise<string>} Path of the written screenshot file.
 */
async function captureRegion(x, y, w, h) {
  const tmpFile = path.join(os.tmpdir(), `region_${Date.now()}.png`);
  const grabbed = await screen.grabRegion(new Region(x, y, w, h));

  // writePNG encodes the buffer as RGBA. Per the nut-js Image API a grab may be
  // in BGR channel order, so normalise it first; toRGB() returns the image
  // unchanged when it is already RGB. The guard keeps older nut-js versions
  // without toRGB() working exactly as before.
  const image = typeof grabbed.toRGB === "function" ? await grabbed.toRGB() : grabbed;

  const { width, height, data } = image;
  writePNG(tmpFile, width, height, data);
  return tmpFile;
}
 
  
 
/**
 * Grab the currently active window and save it as a PNG in the OS temp directory.
 *
 * @returns {Promise<{ imagePath: string, region: { left: number, top: number, width: number, height: number } }>}
 *   Path of the written screenshot plus the window region that was captured.
 */
async function captureActiveWindow() {
  const win = await getActiveWindow();
  const region = await win.region;
  const tmpFile = path.join(os.tmpdir(), `window_${Date.now()}.png`);
  const grabbed = await screen.grabRegion(
    new Region(region.left, region.top, region.width, region.height)
  );

  // writePNG encodes the buffer as RGBA. Per the nut-js Image API a grab may be
  // in BGR channel order, so normalise it first; toRGB() returns the image
  // unchanged when it is already RGB. The guard keeps older nut-js versions
  // without toRGB() working exactly as before.
  const image = typeof grabbed.toRGB === "function" ? await grabbed.toRGB() : grabbed;

  const { width, height, data } = image;
  writePNG(tmpFile, width, height, data);
  return { imagePath: tmpFile, region };
}
 
  
 
/**
 * Encode raw 8-bit RGBA pixels as a minimal PNG file (signature, IHDR, one
 * IDAT, IEND) without any third-party dependency.
 *
 * Rows in `rgbaData` may be tightly packed (`width * 4` bytes each) or carry
 * trailing per-row padding. When the buffer is larger than the packed size and
 * its length divides evenly by `height`, the row stride is taken to be
 * `length / height` and only the first `width * 4` bytes of every row are used.
 *
 * @param {string} filePath Destination file; written synchronously.
 * @param {number} width Image width in pixels.
 * @param {number} height Image height in pixels.
 * @param {Buffer|Uint8Array} rgbaData Pixel bytes, 4 bytes per pixel, row-major.
 * @throws {RangeError} If `rgbaData` holds fewer than `width * height * 4` bytes.
 */
function writePNG(filePath, width, height, rgbaData) {
  const zlib = require("zlib");

  // Buffer#copy is used below, so wrap plain typed arrays.
  const pixels = Buffer.isBuffer(rgbaData) ? rgbaData : Buffer.from(rgbaData);

  const packedRowBytes = width * 4;
  const requiredBytes = packedRowBytes * height;
  if (pixels.length < requiredBytes) {
    throw new RangeError(
      `writePNG: expected at least ${requiredBytes} bytes for a ${width}x${height} RGBA image, got ${pixels.length}`
    );
  }

  // Screen grabs can pad each row; reading with a packed stride would shear the image.
  const hasRowPadding =
    height > 0 && pixels.length > requiredBytes && pixels.length % height === 0;
  const sourceStride = hasRowPadding ? pixels.length / height : packedRowBytes;

  // PNG signature
  const signature = Buffer.from([137, 80, 78, 71, 13, 10, 26, 10]);

  // IHDR chunk
  const ihdr = Buffer.alloc(13);
  ihdr.writeUInt32BE(width, 0);
  ihdr.writeUInt32BE(height, 4);
  ihdr[8] = 8; // bit depth
  ihdr[9] = 6; // color type: RGBA
  ihdr[10] = 0; // compression
  ihdr[11] = 0; // filter
  ihdr[12] = 0; // interlace

  // Each scanline is one filter byte followed by the packed pixel bytes.
  const scanlineBytes = packedRowBytes + 1;
  const rawData = Buffer.alloc(scanlineBytes * height);
  for (let y = 0; y < height; y++) {
    const lineStart = y * scanlineBytes;
    const sourceStart = y * sourceStride;
    rawData[lineStart] = 0; // filter: None
    pixels.copy(rawData, lineStart + 1, sourceStart, sourceStart + packedRowBytes);
  }

  const compressed = zlib.deflateSync(rawData);

  const chunks = [
    signature,
    makeChunk("IHDR", ihdr),
    makeChunk("IDAT", compressed),
    makeChunk("IEND", Buffer.alloc(0)),
  ];

  fs.writeFileSync(filePath, Buffer.concat(chunks));
}
 
  
 
/**
 * Assemble one PNG chunk: 4-byte big-endian payload length, the chunk type,
 * the payload, then the 4-byte big-endian CRC-32 of type + payload.
 *
 * @param {string} type Chunk type, encoded as ASCII (e.g. "IHDR").
 * @param {Buffer} data Chunk payload; may be empty.
 * @returns {Buffer} The complete chunk.
 */
function makeChunk(type, data) {
  // The CRC covers exactly the bytes that sit between the length and the CRC.
  const typeAndPayload = Buffer.concat([Buffer.from(type, "ascii"), data]);

  const lengthField = Buffer.alloc(4);
  lengthField.writeUInt32BE(data.length, 0);

  const crcField = Buffer.alloc(4);
  crcField.writeUInt32BE(crc32(typeAndPayload) >>> 0, 0);

  return Buffer.concat([lengthField, typeAndPayload, crcField]);
}
 
  
 
/**
 * Compute the CRC-32 (reflected polynomial 0xEDB88320) of a byte sequence.
 *
 * The 256-entry lookup table is built on the first call and cached on
 * `crc32.table`.
 *
 * @param {Buffer|Uint8Array} buf Bytes to checksum.
 * @returns {number} The checksum as an unsigned 32-bit integer.
 */
function crc32(buf) {
  if (!crc32.table) {
    crc32.table = Uint32Array.from({ length: 256 }, (_, seed) => {
      let entry = seed;
      for (let bit = 0; bit < 8; bit += 1) {
        const shifted = entry >>> 1;
        entry = (entry & 1) !== 0 ? shifted ^ 0xedb88320 : shifted;
      }
      return entry;
    });
  }
  const lookup = crc32.table;

  let acc = 0xffffffff;
  let pos = 0;
  while (pos < buf.length) {
    acc = lookup[(acc ^ buf[pos]) & 0xff] ^ (acc >>> 8);
    pos += 1;
  }
  return (acc ^ 0xffffffff) >>> 0;
}
 
  
 
/**
 * Run the bundled OCR executable on an image and parse its JSON output.
 *
 * The executable is started directly with the image path as its only
 * argument, so no shell is involved and paths containing quotes, `$`,
 * backticks or spaces are passed through verbatim.
 *
 * @param {string} imagePath Path of the image to recognise.
 * @returns {Array<{text: string, box: number[][]}>} Parsed OCR output.
 * @throws {Error} If the executable fails or exceeds the 30 s timeout.
 * @throws {SyntaxError} If the executable's stdout is not valid JSON.
 */
function runOCR(imagePath) {
  // Required locally so this function does not depend on the file-level import list.
  const { execFileSync } = require("child_process");

  const output = execFileSync(OCR_BIN, [imagePath], {
    encoding: "utf-8",
    timeout: 30000,
  });
  return JSON.parse(output.trim());
}
 
  
 
/**
 * Compute the centre of an OCR bounding box, scaled down by `scaleFactor`.
 *
 * The centre is the midpoint of the box's axis-aligned extent: halfway between
 * the smallest and largest x, and between the smallest and largest y.
 *
 * @param {number[][]} box Corner points, e.g. [[x1,y1], [x2,y2], [x3,y3], [x4,y4]].
 * @returns {{x: number, y: number}} Rounded centre divided by `scaleFactor`.
 */
function getBoxCenter(box) {
  const extent = box.reduce(
    (acc, corner) => ({
      minX: Math.min(acc.minX, corner[0]),
      maxX: Math.max(acc.maxX, corner[0]),
      minY: Math.min(acc.minY, corner[1]),
      maxY: Math.max(acc.maxY, corner[1]),
    }),
    { minX: Infinity, maxX: -Infinity, minY: Infinity, maxY: -Infinity }
  );

  const midX = (extent.minX + extent.maxX) / 2;
  const midY = (extent.minY + extent.maxY) / 2;

  return {
    x: Math.round(midX / scaleFactor),
    y: Math.round(midY / scaleFactor),
  };
}
 
  
 
/**
 * Screenshot -> OCR -> locate the target text -> click it.
 *
 * The temporary screenshot is always removed, including when OCR or matching
 * throws.
 *
 * @param {string} targetText Text to look for and click.
 * @param {object} [options] Optional settings.
 * @param {boolean} [options.fuzzy=false] Match any OCR entry that contains
 *   `targetText` instead of requiring an exact match.
 * @param {boolean} [options.doubleClick=false] Double-click instead of single-click.
 * @param {boolean} [options.activeWindow=false] Capture only the active window
 *   instead of the whole screen.
 * @returns {Promise<{success: boolean, matchedText: (string|null), clickedAt: ({x: number, y: number}|null)}>}
 *   `success` is false (and the other fields null) when the text was not found.
 */
async function findAndClick(targetText, options = {}) {
  const { fuzzy = false, doubleClick = false, activeWindow = false } = options;

  await detectScale();

  console.log(`[1/3] 截屏中...`);
  let imagePath;
  let offsetX = 0;
  let offsetY = 0;
  if (activeWindow) {
    const { imagePath: wp, region } = await captureActiveWindow();
    imagePath = wp;
    offsetX = region.left;
    offsetY = region.top;
    console.log(` 窗口区域: (${region.left}, ${region.top}, ${region.width}x${region.height})`);
  } else {
    imagePath = await captureScreen();
  }
  console.log(` 截图已保存: ${imagePath}`);

  let match;
  try {
    console.log(`[2/3] OCR 识别中...`);
    const results = runOCR(imagePath);
    console.log(` 识别到 ${results.length} 个文字区域:`);
    results.forEach((r) => console.log(` - "${r.text}"`));

    console.log(`[3/3] 查找文字 "${targetText}"（${fuzzy ? "模糊" : "精确"}匹配）...`);
    match = results.find((r) =>
      fuzzy ? r.text.includes(targetText) : r.text === targetText
    );
  } finally {
    // Remove the temporary screenshot even when OCR fails, so errors do not
    // leave files behind in the temp directory.
    try {
      fs.unlinkSync(imagePath);
    } catch {
      // Best effort: a failed cleanup must not mask the real outcome.
    }
  }

  if (!match) {
    console.log(` 未找到文字 "${targetText}"`);
    return { success: false, matchedText: null, clickedAt: null };
  }

  // OCR coordinates are pixels relative to the screenshot; getBoxCenter divides
  // them by scaleFactor. The window offset is added as-is on top of that.
  const boxCenter = getBoxCenter(match.box);
  const center = {
    x: boxCenter.x + offsetX,
    y: boxCenter.y + offsetY,
  };
  console.log(` 找到 "${match.text}"，屏幕坐标: (${center.x}, ${center.y})`);

  await mouse.setPosition(new Point(center.x, center.y));
  if (doubleClick) {
    await mouse.doubleClick(Button.LEFT);
  } else {
    await mouse.click(Button.LEFT);
  }
  console.log(` 已${doubleClick ? "双击" : "点击"} (${center.x}, ${center.y})`);

  return { success: true, matchedText: match.text, clickedAt: center };
}
 
  
 
/**
 * Screenshot and OCR only; nothing is clicked.
 *
 * Each result gets a `center` in screen coordinates: when only the active
 * window is captured, the window's top-left offset is added so the value is
 * directly usable as a click position, the same way findAndClick computes it.
 * The temporary screenshot is always removed, including when OCR throws.
 *
 * @param {object} [options]
 * @param {boolean} [options.activeWindow=false] Capture only the active window.
 * @returns {Promise<Array<{text: string, box: number[][], center: {x: number, y: number}}>>}
 */
async function ocrScreen(options = {}) {
  const { activeWindow = false } = options;
  await detectScale();

  let imagePath;
  let offsetX = 0;
  let offsetY = 0;
  if (activeWindow) {
    const captured = await captureActiveWindow();
    imagePath = captured.imagePath;
    offsetX = captured.region.left;
    offsetY = captured.region.top;
  } else {
    imagePath = await captureScreen();
  }

  let results;
  try {
    results = runOCR(imagePath);
  } finally {
    try {
      fs.unlinkSync(imagePath);
    } catch {
      // Best effort: a failed cleanup must not mask the real outcome.
    }
  }

  return results.map((r) => {
    // getBoxCenter is relative to the screenshot; shift it by the window origin.
    const local = getBoxCenter(r.box);
    return {
      ...r,
      center: { x: local.x + offsetX, y: local.y + offsetY },
    };
  });
}
 
  
 
// Public API: the three capture helpers, the raw OCR call, and the two
// high-level entry points (find-and-click, scan-only).
module.exports = {
  captureScreen,
  captureRegion,
  captureActiveWindow,
  runOCR,
  findAndClick,
  ocrScreen,
};
 
  
 
// Direct command-line use: node index.js "text to click"
//
// With no argument the usage text is printed and the process exits with 1.
// "--scan" as the first argument lists the OCR results without clicking;
// any other first argument is the text to find and click. The flags
// --fuzzy, --double and --window may appear anywhere on the command line.
if (require.main === module) {
  const target = process.argv[2];
  if (!target) {
    console.log("用法: node index.js <要查找并点击的文字> [--fuzzy] [--double] [--window]");
    console.log("");
    console.log("示例:");
    console.log(' node index.js "文件" # 精确匹配，点击文字为"文件"的区域');
    console.log(' node index.js "文件" --fuzzy # 模糊匹配，点击包含"文件"的区域');
    console.log(' node index.js "文件" --window # 仅在当前激活窗口中搜索');
    console.log(' node index.js "文件" --double # 双击');
    console.log("");
    console.log("仅 OCR 识别（不点击）:");
    console.log(" node index.js --scan");
    console.log(" node index.js --scan --window # 仅扫描当前窗口");
    process.exit(1);
  }

  // Log the error and make the process exit non-zero, so scripts calling this
  // CLI can tell a crash from a successful run.
  const reportFailure = (err) => {
    console.error(err);
    process.exitCode = 1;
  };

  if (target === "--scan") {
    const activeWindow = process.argv.includes("--window");
    ocrScreen({ activeWindow })
      .then((results) => {
        console.log(`${activeWindow ? "窗口" : "屏幕"} OCR 识别结果:`);
        results.forEach((r) => {
          console.log(` "${r.text}" -> (${r.center.x}, ${r.center.y})`);
        });
      })
      .catch(reportFailure);
  } else {
    const fuzzy = process.argv.includes("--fuzzy");
    const doubleClick = process.argv.includes("--double");
    const activeWindow = process.argv.includes("--window");
    findAndClick(target, { fuzzy, doubleClick, activeWindow })
      .then((result) => {
        if (!result.success) process.exit(1);
      })
      .catch(reportFailure);
  }
}