欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

爬取东方财富的利润表数据记录

程序员文章站 2022-06-16 08:09:54
...

想看看各家上市公司的研发费用,所以去东方财富里面抓取数据。

1抓取准备工作

爬取东方财富的利润表数据记录1首先找到研发费用的数据的key名字
2找去找到是如何访问得到这些数据
查看源代码

爬取东方财富的利润表数据记录得到研发费用的名字是RDEXP,再去看是访问了什么url获取了资源的
爬取东方财富的利润表数据记录从名字看就知道是通过ajax获取数据的,所以我们进去看ajax返回数据有没有复合要求的,找到了
http://f10.eastmoney.com/NewFinanceAnalysis/lrbAjax?companyType=4&reportDateType=0&reportType=1&endDate=&code=SH603893
从参数看code就是上市公司的代码,其他的参数,没关注。
返回的数据是一个json字符串,大家自己发掘。
从哪里获取上市公司的code呢?
同花顺有数据导出excel功能,可以把代码、市盈率、市值等指标导出,这样我们就有了所有的代码了。
2抓取数据代码
//由于同花顺导出来的xls,一直解析不了,不知道哪里有问题,所以我自己重新创建了一个xls,把同花顺的内容拷贝过去

package com.xreport2_demo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.collections4.Put;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.BorderStyle;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.FillPatternType;
import org.apache.poi.ss.usermodel.Font;
import org.apache.poi.ss.usermodel.HorizontalAlignment;
import org.apache.poi.ss.usermodel.IndexedColors;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.matrix.ssm.service.impl.UserServiceImpl;

//http://f10.eastmoney.com/NewFinanceAnalysis/lrbAjax?companyType=4&reportDateType=0&reportType=1&endDate=&code=SH603893


public class HttpClientDemo {
	private static Logger logger = Logger.getLogger(HttpClientDemo.class);
	private static String columns[] = {"空白","代码",	"名称","所属行业","TTM市盈率","净利润","市净率","总市值","流通市值"};
	private static String columnsNew[] = {"空白	","代码",	"名称","所属行业","TTM市盈率","净利润","市净率","总市值","流通市值","研发费用","研发日期","研发费用与总市值比"};
//	private static String columns[] = {"代码"};
	public static void main(String[] args) throws Exception {
		//由于同花顺导出来的xls,一直解析不了,不知道哪里有问题,所以我自己重新创建了一个xls,把同花顺的内容拷贝过去
		//解析出excel中的数据
		List<Map<String, String>> list = readExcel("D://data//1.xls", columns);
//		int i=0;
		for(Map map : list){
//			if(i>=10){
//				break;
//			}
//			i++;
			//怕东方财富一下给我拉黑
			Thread.sleep((int)Math.random()*1000);
			//获取研发费用与研发日期,因为年报有的公司没有出
			getCostResarch(map);
		}
		//将list的值写入到新的excel
		exportExcel(list,"D://data//研发费用.xls");
	}
	/**
	 * 传入单元格中的
	 * @param map
	 * @return
	 */
	public static void getCostResarch(Map<String,String> map){
		   //获取url
		String companyCode =  map.get("代码");
		String surl = "http://f10.eastmoney.com/NewFinanceAnalysis/lrbAjax?companyType=4&reportDateType=0&reportType=1&endDate=&code="+companyCode;
	    BufferedReader br = null;
		try {
			URL url = new URL(surl);
			//下载数据
			InputStream is = url.openStream();
			br = new BufferedReader(new InputStreamReader(is,"utf-8"));
			String msg = null;
			while (null != (msg = br.readLine())){
				String tmp = StringEscapeUtils.unescapeJavaScript(msg);
				tmp = tmp.substring(1,tmp.length()-1);
				logger.info(msg);
				List<Map<String,String>> listObjectFir = (List<Map<String,String>>) JSONArray.parse(tmp);
				for(Map<String,String> data : listObjectFir){
					if(("2019/12/31 0:00:00").equals(data.get("REPORTDATE"))){
						map.put("RDEXP", data.get("RDEXP"));
						map.put("REPORTDATE", data.get("REPORTDATE"));
					}
					//3月份的数据在前面,用年报数据覆盖3月份数据
					if(("2020/3/31 0:00:00").equals(data.get("REPORTDATE"))){
						map.put("RDEXP", data.get("RDEXP"));
						map.put("REPORTDATE", data.get("REPORTDATE"));
					}
					
				}
			}
		} catch (MalformedURLException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (UnsupportedEncodingException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}finally {
			if(br!=null){
				 try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
	/**
	 * 读取excel
	 * @param filePath
	 * @param columns
	 * @return
	 */
	public static List<Map<String, String>> readExcel(String filePath,String columns[]) {
		Sheet sheet = null;
		Row row = null;
		Row rowHeader = null;
		List<Map<String, String>> list = null;
		String cellData = null;
		Workbook wb = null;
		if (filePath == null) {
			return null;
		}
		String extString = filePath.substring(filePath.lastIndexOf("."));
		InputStream is = null;
		try {
			is = new FileInputStream(filePath);
			if (".xls".equals(extString)) {
				wb = new HSSFWorkbook(is);
			} else if (".xlsx".equals(extString)) {
				wb = new XSSFWorkbook(is);
			} else {
				wb = null;
			}
			if (wb != null) {
				// 用来存放表中数据
				list = new ArrayList<Map<String, String>>();
				// 获取第一个sheet
				sheet = wb.getSheetAt(0);
				// 获取最大行数
				int rownum = sheet.getPhysicalNumberOfRows();
				// 获取第一行
				rowHeader = sheet.getRow(0);
				row = sheet.getRow(0);
				// 获取最大列数
				int colnum = row.getPhysicalNumberOfCells();
				for (int i = 1; i < rownum; i++) {
					Map<String, String> map = new LinkedHashMap<String, String>();
					row = sheet.getRow(i);
					if (row != null) {
						for (int j = 0; j < colnum; j++) {
							if(columns[j].equals(getCellFormatValue(rowHeader.getCell(j)))){
								cellData = (String) getCellFormatValue(row
										.getCell(j));
								map.put(columns[j], cellData);
							}
						}
					} else {
						break;
					}
					list.add(map);
				}
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return list;
	}
 
	/**	获取单个单元格数据
	 * @param cell
	 * @return
	 * @author lizixiang ,2018-05-08
	 */
	 private static String getCellFormatValue(Cell cell) {
	        if(cell==null){
	            return null;
	        }
	        String returnValue = null;
	        switch (cell.getCellType()) {
	            case NUMERIC:   //数字
	                Double doubleValue = cell.getNumericCellValue();
	                // 格式化科学计数法,取一位整数
	                DecimalFormat df = new DecimalFormat("0");
	                returnValue = df.format(doubleValue);
	                break;
	            case STRING:    //字符串
	                returnValue = cell.getStringCellValue();
	                break;
	            case BOOLEAN:   //布尔
	                Boolean booleanValue = cell.getBooleanCellValue();
	                returnValue = booleanValue.toString();
	                break;
	            case BLANK:     // 空值
	                break;
	            case FORMULA:   // 公式
	                returnValue = cell.getCellFormula();
	                break;
	            case ERROR:     // 故障
	                break;
	            default:
	                break;
	        }
	        return returnValue;
	    }

//	public static Object getCellFormatValue(Cell cell) {
//		Object cellValue = null;
//		if (cell != null) {
//			// 判断cell类型
//			switch (cell.getCellType()) {
//			case Cell.CELL_TYPE_NUMERIC: {
//				cellValue = String.valueOf(cell.getNumericCellValue());
//				break;
//			}
//			case Cell.CELL_TYPE_FORMULA: {
//				// 判断cell是否为日期格式
//				if (DateUtil.isCellDateFormatted(cell)) {
//					// 转换为日期格式YYYY-mm-dd
//					cellValue = cell.getDateCellValue();
//				} else {
//					// 数字
//					cellValue = String.valueOf(cell.getNumericCellValue());
//				}
//				break;
//			}
//			case Cell.CELL_TYPE_STRING: {
//				cellValue = cell.getRichStringCellValue().getString();
//				break;
//			}
//			default:
//				cellValue = "";
//			}
//		} else {
//			cellValue = "";
//		}
//		return cellValue;
//	}
	
	/**
	 * 到处数据
	 * @param list
	 */
	public static void exportExcel(List<Map<String, String>> list, String exportFilePath){
		 // 写入数据到工作簿对象内
        Workbook workbook = exportData(list);
        // 以文件的形式输出工作簿对象
        FileOutputStream fileOut = null;
        try {
            File exportFile = new File(exportFilePath);
            if (!exportFile.exists()) {
                exportFile.createNewFile();
            }

            fileOut = new FileOutputStream(exportFilePath);
            workbook.write(fileOut);
            fileOut.flush();
        } catch (Exception e) {
        	e.printStackTrace();
        } finally {
            try {
                if (null != fileOut) {
                    fileOut.close();
                }
                if (null != workbook) {
                    workbook.close();
                }
            } catch (IOException e) {
            	e.printStackTrace();
            }
        }

    }

	/**
	 * 生成Excel并写入数据信息
	 * 
	 * @param dataList
	 *            数据列表
	 * @return 写入数据后的工作簿对象
	 */
	public static Workbook exportData(List<Map<String, String>> dataList) {
		// 生成xlsx的Excel
//		Workbook workbook = new SXSSFWorkbook();

		// 如需生成xls的Excel,请使用下面的工作簿对象,注意后续输出时文件后缀名也需更改为xls
		Workbook workbook = new HSSFWorkbook();

		// 生成Sheet表,写入第一行的列头
		Sheet sheet = buildDataSheet(workbook);
		// 构建每行的数据内容
		int rowNum = 1;
		for (Map data :dataList) {
			if (data == null) {
				continue;
			}
			// 输出行数据
			Row row = sheet.createRow(rowNum++);
			convertDataToRow(data, row);
		}
		return workbook;
	}

	/**
	 * 生成sheet表,并写入第一行数据(列头)
	 * 
	 * @param workbook
	 *            工作簿对象
	 * @return 已经写入列头的Sheet
	 */
	private static Sheet buildDataSheet(Workbook workbook) {
		Sheet sheet = workbook.createSheet();
		// 设置列头宽度
		for (int i = 0; i < columnsNew.length; i++) {
			sheet.setColumnWidth(i, 4000);
		}
		// 设置默认行高
		sheet.setDefaultRowHeight((short) 400);
		// 构建头单元格样式
		CellStyle cellStyle = buildHeadCellStyle(sheet.getWorkbook());
		// 写入第一行各列的数据
		Row head = sheet.createRow(0);
		for (int i = 0; i < columnsNew.length; i++) {
			Cell cell = head.createCell(i);
			cell.setCellValue(columnsNew[i]);
			cell.setCellStyle(cellStyle);
		}
		return sheet;
	}

	/**
	 * 设置第一行列头的样式
	 * 
	 * @param workbook
	 *            工作簿对象
	 * @return 单元格样式对象
	 */
	private static CellStyle buildHeadCellStyle(Workbook workbook) {
		CellStyle style = workbook.createCellStyle();
		// 对齐方式设置
		style.setAlignment(HorizontalAlignment.CENTER);
		// 边框颜色和宽度设置
		style.setBorderBottom(BorderStyle.THIN);
		style.setBottomBorderColor(IndexedColors.BLACK.getIndex()); // 下边框
		style.setBorderLeft(BorderStyle.THIN);
		style.setLeftBorderColor(IndexedColors.BLACK.getIndex()); // 左边框
		style.setBorderRight(BorderStyle.THIN);
		style.setRightBorderColor(IndexedColors.BLACK.getIndex()); // 右边框
		style.setBorderTop(BorderStyle.THIN);
		style.setTopBorderColor(IndexedColors.BLACK.getIndex()); // 上边框
		// 设置背景颜色
		style.setFillForegroundColor(IndexedColors.GREY_25_PERCENT.getIndex());
		style.setFillPattern(FillPatternType.SOLID_FOREGROUND);
		// 粗体字设置
		Font font = workbook.createFont();
		font.setBold(true);
		style.setFont(font);
		return style;
	}

	/**
	 * 将数据转换成行
	 * 
	 * @param data
	 *            源数据
	 * @param row
	 *            行对象
	 * @return
	 */
	private static void convertDataToRow(Map<String, String> data, Row row) {
		int cellNum = 0;
		Cell cell;
		// 空白
		cell = row.createCell(cellNum++);
		cell.setCellValue(null == data.get("空白") ? "" : data.get("空白"));
		// 代码
		cell = row.createCell(cellNum++);
		cell.setCellValue(null == data.get("代码") ? "" : data.get("代码"));
		// 代码
		cell = row.createCell(cellNum++);
		cell.setCellValue(null == data.get("名称") ? "" : data.get("名称"));
		// 代码
		cell = row.createCell(cellNum++);
		cell.setCellValue(null == data.get("所属行业") ? "" : data.get("所属行业"));
		// 代码
		cell = row.createCell(cellNum++);
		cell.setCellValue(null == data.get("TTM市盈率") ? "" : data.get("TTM市盈率"));
		cell = row.createCell(cellNum++);
		cell.setCellValue(null == data.get("市净率") ? "" : data.get("市净率"));
		// 代码
		cell = row.createCell(cellNum++);
		cell.setCellValue(null == data.get("净利润") ? "" : data.get("净利润"));
		// 代码
		cell = row.createCell(cellNum++);
		cell.setCellValue(null == data.get("总市值") ? "" : data.get("总市值"));
		// 代码
		cell = row.createCell(cellNum++);
		cell.setCellValue(null == data.get("流通市值") ? "" : data.get("流通市值"));
		// 代码
		cell = row.createCell(cellNum++);
		cell.setCellValue(null == data.get("RDEXP") ? "" : data.get("RDEXP"));
		// 代码
		cell = row.createCell(cellNum++);
		cell.setCellValue(null == data.get("REPORTDATE") ? "" : data.get("REPORTDATE"));
		// 代码
		cell = row.createCell(cellNum++);
		if(data.get("RDEXP")==null||"".equals(data.get("RDEXP"))){
			cell.setCellValue("0");
		}else{
			try {
				cell.setCellValue(Double.parseDouble(data.get("RDEXP"))/Double.parseDouble(data.get("总市值")));
			} catch (Exception e) {
				cell.setCellValue("出错");
			}
		}
		
	}
}

<dependencies>
		<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-collections4 -->
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-collections4</artifactId>
			<version>4.4</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml-schemas</artifactId>
			<version>4.0.0</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi</artifactId>
			<version>4.0.1</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml</artifactId>
			<version>4.0.1</version>
		</dependency>
		<dependency>
			<groupId>org.apache.xmlbeans</groupId>
			<artifactId>xmlbeans</artifactId>
			<version>2.6.0</version>
		</dependency>
		<!-- <dependency> <groupId>com.bstek.ureport</groupId> <artifactId>ureport2-console</artifactId> 
			<version>2.2.9</version> </dependency> -->
		<!-- https://mvnrepository.com/artifact/commons-codec/commons-codec -->

		<dependency>
			<groupId>commons-codec</groupId>
			<artifactId>commons-codec</artifactId>
			<version>1.3</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/cn.bestwu/common-lang -->
		<!-- <dependency> <groupId>cn.bestwu</groupId> <artifactId>common-lang</artifactId> 
			<version>1.1.5</version> </dependency> -->
		<!-- https://mvnrepository.com/artifact/commons-lang/commons-lang -->
		<dependency>
			<groupId>commons-lang</groupId>
			<artifactId>commons-lang</artifactId>
			<version>2.6</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
		<!-- <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> 
			<version>3.10</version> </dependency> -->
		<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>1.2.61</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.0</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
		<!-- https://mvnrepository.com/artifact/commons-httpclient/commons-httpclient -->
		<dependency>
			<groupId>commons-httpclient</groupId>
			<artifactId>commons-httpclient</artifactId>
			<version>3.0</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/commons-logging/commons-logging -->
		<dependency>
			<groupId>commons-logging</groupId>
			<artifactId>commons-logging</artifactId>
			<version>1.1.1</version>
		</dependency>

结果
爬取东方财富的利润表数据记录

相关标签: 数据爬取 java