
HiveSQL Parsing (Based on the AST)

Our project has to interact with the big-data team, so I needed to parse out the tables used in a Hive statement and its outermost columns; later on we may also need the lineage between tables and columns. Searching online turned up essentially a single answer, copied around as a template, and it didn't quite fit what I needed. So I decided to parse out what I wanted myself. It's barely good enough, but I'm writing it down here.
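Everything below builds on Hive's own parser: org.apache.hadoop.hive.ql.parse.ParseDriver turns a HiveQL string into an ANTLR AST of ASTNode objects, and the token types defined in HiveParser (TOK_QUERY, TOK_INSERT, TOK_TABNAME, TOK_SELECT, ...) tell you what each node is. A minimal sketch of that building block, assuming only that hive-exec is on the classpath (the class name AstTreeDemo is mine, not from the original post):

import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;
import org.apache.hadoop.hive.ql.parse.ParseException;

public class AstTreeDemo {
    public static void main(String[] args) throws ParseException {
        ParseDriver pd = new ParseDriver();
        // parse() returns the root ASTNode of the statement
        ASTNode ast = pd.parse("select a.id from ods.arc_user a");
        // toStringTree() prints the nested (TOK_QUERY (TOK_FROM ...) ...) structure,
        // which is exactly the tree the class below walks recursively
        System.out.println(ast.toStringTree());
    }
}

The class below does that walk, pushing subquery info onto stacks so nested SELECTs can be attached to their parent query.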

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

import java.util.*;
import java.util.stream.Collectors;

import static org.apache.hadoop.hive.ql.parse.HiveParser.*;

/**
 * @author chentiefeng
 * @date 2019/10/21 13:51
 */
@Slf4j
public class HiveSqlParse {
    private ParseDriver pd = new ParseDriver();
    /**
     * Source tables (table name, alias)
     */
    private List<String[]> sourceTable = Lists.newArrayList();
    /**
     * Insert target tables
     */
    private List<String> insertTables = Lists.newArrayList();
    /**
     * Outermost columns
     */
    private List<String> outermostColumns = Lists.newArrayList();
    /**
     * Insert partition info (partition column, partition value)
     */
    private Map<String, String> partitionMap = Maps.newHashMap();
    /**
     * Outermost SELECT node
     */
    private ASTNode outermostSelNode = null;
    /**
     * Outermost INSERT node
     */
    private ASTNode outermostInsertNode = null;
    /**
     * Stacks of tables whose SELECT / FROM clauses are still being parsed
     */
    private Stack<HiveTableParseInfo> tableParseInfoSelStack = new Stack<>();
    private Stack<HiveTableParseInfo> tableParseInfoFromStack = new Stack<>();
    /**
     * Table relationship parse info, excluding source tables
     */
    private HiveTableParseInfo tableParseInfo = null;

    public HiveSqlParse() {
    }

    public HiveSqlParse(String sql) {
        parse(sql);
    }

    /**
     * Parse a statement.
     *
     * @param sql the HiveQL statement
     */
    public void parse(String sql) {
        try {
            ASTNode ast = pd.parse(sql);
            log.info("hiveSql={}, astTree={}", sql, ast.toStringTree());
            parseNode(ast);
            insert(outermostInsertNode);
            outermostColumns(outermostSelNode);
            // the insert target also shows up as a TOK_TABNAME during the walk, so drop it
            // from the source tables (guarded: a plain SELECT has no insert table)
            if (CollectionUtils.isNotEmpty(insertTables)) {
                sourceTable.removeIf(arr -> arr[0].equals(insertTables.get(0)));
            }
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            throw new RuntimeException(e);
        }
    }

    private void parseNode(ASTNode ast) {
        if (CollectionUtils.isNotEmpty(ast.getChildren())) {
            for (Node child : ast.getChildren()) {
                ASTNode cc = (ASTNode) child;
                switch (cc.getToken().getType()) {
                    case TOK_INSERT:
                        outermostInsertNode = cc;
                        break;
                    case TOK_TABNAME:
                        // a table reference: join the (db, table) identifier children with "."
                        String tableName = Joiner.on(".").join(cc.getChildren().stream().map(n -> ((ASTNode) n).getText()).collect(Collectors.toList()));
                        ASTNode ccChild = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                        HiveTableParseInfo sourceTableParseInfo = new HiveTableParseInfo();
                        if (ccChild.getToken().getType() == TOK_TABNAME) {
                            // no alias: the table name itself is the parent's last child
                            sourceTable.add(new String[]{tableName, ""});
                            sourceTableParseInfo.setAlias("");
                        } else {
                            sourceTable.add(new String[]{tableName, ccChild.getText()});
                            sourceTableParseInfo.setAlias(ccChild.getText());
                        }
                        sourceTableParseInfo.setName(tableName);
                        if (!tableParseInfoFromStack.empty()) {
                            tableParseInfoFromStack.pop().getTables().add(sourceTableParseInfo);
                        }
                        break;
                    case TOK_QUERY:
                        ASTNode ccc = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                        if (ccc.getToken().getType() != TOK_QUERY) {
                            // an aliased subquery: remember it on both stacks
                            HiveTableParseInfo table = new HiveTableParseInfo();
                            table.setAlias(ccc.getText());
                            tableParseInfoSelStack.push(table);
                            tableParseInfoFromStack.push(table);
                        }
                        break;
                    case TOK_SELECT:
                    case TOK_SELECTDI:
                        // attach the finished subquery to its parent, or keep it as the overall result
                        HiveTableParseInfo pop = tableParseInfoSelStack.pop();
                        if (!tableParseInfoSelStack.empty()) {
                            HiveTableParseInfo father = tableParseInfoSelStack.peek();
                            if (Objects.nonNull(father)) {
                                father.getTables().add(pop);
                            }
                        } else {
                            tableParseInfo = pop;
                        }
                        parseColumns(cc, pop);
                        continue;
                    default:
                }
                parseNode(cc);
            }
        }
    }

    private void insert(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node child : cn.getChildren()) {
            ASTNode cc = (ASTNode) child;
            switch (cc.getToken().getType()) {
                case TOK_INSERT_INTO:
                case TOK_DESTINATION:
                    insertTable(cn);
                    continue;
                case TOK_SELECT:
                    outermostSelNode = cn;
                    continue;
                default:
            }
            insert(cc);
        }
    }

    private void parseColumns(ASTNode cc, HiveTableParseInfo table) {
        for (Node node : cc.getChildren()) {
            ASTNode tokSelExpr = (ASTNode) node;
            HiveTableParseInfo.HiveTableColumnParseInfo column = new HiveTableParseInfo.HiveTableColumnParseInfo();
            String alias = getSelExprAlias(tokSelExpr);
            column.setName(alias);
            parseColumn(tokSelExpr, column);
            table.getColumns().add(column);
        }
    }

    private void parseColumn(ASTNode tokSelExpr, HiveTableParseInfo.HiveTableColumnParseInfo column) {
        if (CollectionUtils.isEmpty(tokSelExpr.getChildren())) {
            return;
        }
        for (Node child : tokSelExpr.getChildren()) {
            ASTNode cc = (ASTNode) child;
            if (cc.getToken().getType() == TOK_TABLE_OR_COL) {
                // walk down to every TOK_TABLE_OR_COL and record a (table, column) pair, de-duplicated
                ASTNode ccc = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                String[] item;
                if (ccc.getToken().getType() == TOK_TABLE_OR_COL) {
                    // bare column without a table qualifier
                    item = new String[]{cc.getChild(0).getText(), ""};
                } else {
                    item = new String[]{ccc.getText(), cc.getChild(0).getText()};
                }
                Optional<String[]> any = column.getSourceList().stream().filter(s -> Arrays.equals(item, s)).findAny();
                if (!any.isPresent()) {
                    column.getSourceList().add(item);
                }
                continue;
            }
            parseColumn(cc, column);
        }
    }

    /**
     * Collect the insert target table and its partition spec.
     *
     * @param cn INSERT subtree
     */
    private void insertTable(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node child : cn.getChildren()) {
            ASTNode cc = (ASTNode) child;
            switch (cc.getToken().getType()) {
                case TOK_TABNAME:
                    String tableName = Joiner.on(".").join(cc.getChildren().stream().map(n -> ((ASTNode) n).getText()).collect(Collectors.toList()));
                    insertTables.add(tableName);
                    break;
                case TOK_PARTVAL:
                    if (cc.getChildCount() == 2) {
                        // static partition: column = value
                        partitionMap.put(cc.getChild(0).getText(), cc.getChild(1).getText());
                    } else {
                        // dynamic partition: column only, no value
                        partitionMap.put(cc.getChild(0).getText(), null);
                    }
                    break;
                default:
            }
            insertTable(cc);
        }
    }

    /**
     * Collect the outermost columns.
     *
     * @param cn outermost SELECT subtree
     */
    private void outermostColumns(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node cnChild : cn.getChildren()) {
            ASTNode cc = (ASTNode) cnChild;
            if (cc.getToken().getType() == TOK_SELEXPR) {
                String alias = getSelExprAlias(cc);
                outermostColumns.add(alias);
                continue;
            }
            outermostColumns(cc);
        }
    }

    /**
     * Get the output name of a select expression: the explicit alias if present,
     * otherwise the (last part of the) column reference itself.
     *
     * @param cc TOK_SELEXPR node
     * @return column alias
     */
    private String getSelExprAlias(ASTNode cc) {
        ASTNode child = (ASTNode) cc.getChild(cc.getChildCount() - 1);
        if (child.getToken().getType() == TOK_TABLE_OR_COL || child.getToken().getType() == DOT) {
            return child.getChild(child.getChildCount() - 1).getText();
        } else {
            return child.getText();
        }
    }

    public List<String> getOutermostColumns() {
        return outermostColumns;
    }

    public List<String> getSourceTables() {
        return sourceTable.stream().map(t -> t[0]).distinct().collect(Collectors.toList());
    }

    public String getInsertTable() {
        return CollectionUtils.isNotEmpty(insertTables) ? insertTables.get(0) : null;
    }

    public Map<String, String> getPartition() {
        return partitionMap;
    }

    public HiveTableParseInfo getTableParseInfo() {
        return tableParseInfo;
    }

    public static void main(String[] args) {
        String sql23 = "insert overwrite table risk_event partition(year='2019',dt) select t.ops as order_no,t.id_no ,concat(t.consumer_no,'aa') dd,aadx from (select concat(a.opt_id,b.opt_id) as ops,b.id_no from ods.arc_event a left outer join ods.arc_user b on a.consumer_no = b.consumer_no) t left outer join (select order_no from arc_verify where dt = '20191023') t1 on t.consumer_no = t1.consumer_no";
//        String sql23 = "insert overwrite table riskt_eventpartition select opt_id from arc_event a inner join arc_user b";
//        String sql23 = "insert overwrite table riskt_eventpartition select opt_id from arc_event";
//        String sql23 = "select sum(case when rcw.eventid=2 and rcw.method = 'sendevent' then 1 else 0 end) as successcnt," +
//                "       sum(case when rcw.eventid=4 and rcw.method = 'risklevel' then 1 else 0 end) as paycnt," +
//                "       sum(case when rcw.eventid=2 and rcw.method = 'sendevent' then 1 else 0 end)/sum(case when rcw.eventid=4 and rcw.method = 'risklevel' then 1 else 0 end) as rate" +
//                "  from (\n" +
//                "        select distinct payorderid," +
//                "               eventid," +
//                "               method" +
//                "          from log.pay_rc_warden_event_basic" +
//                "         where dt = '20180715'" +
//                "       ) rcw";
        HiveSqlParse hiveSqlParse = new HiveSqlParse(sql23);
        System.out.println(hiveSqlParse.getSourceTables());
        System.out.println(hiveSqlParse.getOutermostColumns());
        System.out.println(hiveSqlParse.getInsertTable());
        System.out.println(hiveSqlParse.getPartition());
        System.out.println(hiveSqlParse.getTableParseInfo());
    }
}
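
One caveat: the post never shows the HiveTableParseInfo class the parser fills in. From the getters and setters called above (setName/setAlias/getTables/getColumns, plus setName/getSourceList on the column), a minimal reconstruction might look like the following; treat the field types as my assumptions, not the author's actual code:

import com.google.common.collect.Lists;
import lombok.Data;

import java.util.List;

/**
 * Hypothetical reconstruction of the missing class: one (sub)query's name,
 * alias, nested tables and selected columns, matching how HiveSqlParse uses it.
 */
@Data
public class HiveTableParseInfo {
    private String name;
    private String alias;
    private List<HiveTableParseInfo> tables = Lists.newArrayList();
    private List<HiveTableColumnParseInfo> columns = Lists.newArrayList();

    @Data
    public static class HiveTableColumnParseInfo {
        private String name;
        /** (table or alias, source column) pairs this column is derived from */
        private List<String[]> sourceList = Lists.newArrayList();
    }
}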
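
For the sample statement in main, the code is meant to report ods.arc_event, ods.arc_user and arc_verify as source tables; order_no, id_no, dd and aadx as the outermost columns; risk_event as the insert target; and a partition map with year='2019' (static) and dt=null (dynamic). The exact console output will depend on the hive-exec version on the classpath.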