2019-04-15 12:48:08 卢浮宫 版权声明:本文为站长原创文章,转载请写明出处
一、业务背景
今天接收到业务需求是把国航app数据抓取并写入到mongodb数据库中,这里简单记录下。
二、抓取数据地址
手机app数据地址是通过fiddler 4 来进行抓取的(后面会补充一个fiddler的使用教程),然后我们把抓取到的url放到浏览器中进行调试,确认能够拿到数据,从而得到真正的数据请求url
简单说一下,这个数据请求地址可能隐藏在调用的js中,具体情况具体分析吧
三、拿到数据源
①通过httpclient进行数据请求,获取到数据源(一般是JSON数据或是JSON字符串)
②根据需要格式化数据(推荐使用下阿里巴巴的fastJSON来进行数据处理,个人感觉比较好用)
四、写入到数据库
①第一次操作时使用了基本的操作方式,后面就改用mongoTemplate了
②创建一个mongoDBUtil类用来进行所有的数据库操作,核心代码如下
private static final String MONGO_HOST = "你的数据库地址"; private static final Integer MONGO_PORT = "你的端口"; private static final String MONGO_DB_NAME = "你的数据库名称"; private static final String MONGO_USERNAME = "你的用户名"; private static final String MONGO_PASSWORD = "你的密码"; private static final String MONGO_COLLECTION_NAME = "需要操作的数据库表"; public static DBCollection mongoDbConnect() throws UnknownHostException { // 获取Mongo客户端 MongoClient mongoClient = new MongoClient(MONGO_HOST, MONGO_PORT); DB db = mongoClient.getDB(MONGO_DB_NAME); // 2.1用户名&密码校验 @SuppressWarnings("deprecation") boolean auth = db.authenticate(MONGO_USERNAME, MONGO_PASSWORD.toCharArray()); if (!auth) { System.out.println(MONGO_DB_NAME + "连接失败!"); return null; } System.out.println(MONGO_DB_NAME + "连接成功!"); DBCollection collection = db.getCollection(MONGO_COLLECTION_NAME); return collection; } //其他表的连接设置 test public static DBCollection mongoDbConnectForXa() throws UnknownHostException { // 获取Mongo客户端 MongoClient mongoClient = new MongoClient(MONGO_HOST, MONGO_PORT); DB db = mongoClient.getDB(MONGO_DB_NAME); // 2.1用户名&密码校验 @SuppressWarnings("deprecation") boolean auth = db.authenticate(MONGO_USERNAME, MONGO_PASSWORD.toCharArray()); if (!auth) { System.out.println(MONGO_DB_NAME + "连接失败!"); return null; } System.out.println(MONGO_DB_NAME + "连接成功!"); DBCollection collection = db.getCollection("xaIgnioreFlightno"); return collection; } public static void mongoDbDisConnect() throws UnknownHostException{ //断开并释放资源 MongoClient mongoClient = new MongoClient(MONGO_HOST, MONGO_PORT); mongoClient.close(); mongoClient = null; } public static void doInsert(DBCollection collection,List<DBObject> dbList){ //批量写入数据 collection.insert(dbList); } public static void doRemove(DBCollection collection, BasicDBObject document){ //删除单条数据 collection.remove(document); } public static void doRemoveById(DBCollection collection, String airShiftId){ //根据id删除记录 BasicDBObject document = new BasicDBObject(); document.put("_id", airShiftId); 
collection.remove(document); } public static void doRemoveMany(DBCollection collection, List<String> idList){ //删除一批数据 BasicDBObject query = new BasicDBObject(); for(int i=0;i<idList.size();i++){ query.put("_id", idList.get(i)); collection.remove(query); } } public static List<Object> getNeedUploadData(){ List<Object> rltList = new ArrayList<Object>(); try { DBCollection collection = mongoDbConnect(); BasicDBObject searchObj = new BasicDBObject(); searchObj.put("buildDate", DateUtil.getNowDate()); DBCursor cursor = collection.find(searchObj).sort(new BasicDBObject("flightno",-1)); if (cursor.hasNext()) { while (cursor.hasNext()) { rltList.add(cursor.next()); } } else { System.out.println("当前暂无需同步数据!"); } } catch (UnknownHostException e) { e.printStackTrace(); } return rltList; }
五、业务操作相关代码如下(以一个数据写入为例):
try { for (int i = 0; i < ffList.size(); i++) { JSONObject jsonObj = JSON.parseObject(ffList.get(i).toString()); String flightListStr = jsonObj.getString("flightList"); JSONArray flightListArray = JSON.parseArray(flightListStr); JSONObject flightListObj = JSON.parseObject(flightListArray.get(0).toString()); airId = jsonObj.getString("id"); airClass = Mycrawler.class.toString(); airAir = jsonObj.getString("pnr"); airDpt = jsonObj.getString("orgdstDes").split("-")[0]; airArr = jsonObj.getString("orgdstDes").split("-")[1]; airDepartDate = jsonObj.getString("calendarDate"); airBegindate = airDepartDate + " "+ flightListObj.getString("departureTime"); airEnddate = airDepartDate + " "+ flightListObj.getString("arrivalTime"); airFlightno = flightListObj.getString("flightNo"); airCabin = flightListObj.getString("seatClass"); airSeatSale = flightListObj.getString("seatNum"); airTicketprice = jsonObj.getString("ticketPrice"); airTimeLimit = jsonObj.getString("endTime"); // 格式化DBObject数据 //这里的逻辑只负责元数据写入及删除,不做其他处理(保证数据抓取速度) // Map<String, String> checkRlt = dataCheck(collection, airAir,airDepartDate, airBegindate, airFlightno, airCabin,airTicketprice); // String checkState = checkRlt.get("checkState"); // switch (checkState) { // case "new": // idList.add(airId); // break; // case "old": // case "newButSame": // continue; // } DBObject document = getDbObjectTmp(collection, airId, airClass,airAir, airDpt, airArr, airDepartDate, airBegindate,airEnddate, airFlightno, airCabin, airSeatSale,airTicketprice, airCreatedate, airTimeLimit); dbList.add(document); System.out.println(Thread.currentThread().getName() + " productId_ " + productId + " data " + document); } } catch (Exception e) { ExceptionUtil.logException("解析航班数据异常:", e); } finally { MongoDBUtil.doInsert(collection, dbList); }