num_Map_tasks = max[${Mapred.min.split.size},
min(${dfs.block.size}, ${Mapred.max.split.size})]
/* Create a compact index on the id column of index_test_table. */
/* The handler is a Java class name and is therefore case-sensitive: */
/* the package must be spelled org.apache.hadoop.hive (all lower case). */
create index idx on table index_test_table(id)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
with deferred rebuild;

/* The index was declared "with deferred rebuild", so it is populated */
/* explicitly by this rebuild statement. */
alter index idx on index_test_table rebuild;
/* Prune the index: read the index table built above and keep only the */
/* entries matching the predicate the final query is going to use. */
/* Unlike an RDBMS, the index cannot simply be used right after it is */
/* created - doing so fails with an error, which is the cumbersome part. */
/* Both columns start with an underscore, so both must be back-quoted. */
create table my_index
as
select
    `_bucketname`,
    `_offsets`
from default__index_test_table_idx__
where id = 10;
/* The index can be used now. The predicate of the final query must be */
/* the same as the one used to prune the index into my_index above. */
/* Property names, the class name and the HDFS path are case-sensitive. */
/* NOTE(review): the path assumes the default warehouse location */
/* /user/hive/warehouse - confirm against hive.metastore.warehouse.dir. */
set hive.index.compact.file = /user/hive/warehouse/my_index;
set hive.input.format = org.apache.hadoop.hive.ql.index.compact.HiveCompactIndexInputFormat;

select count(*)
from index_test_table
where id = 10;
/* Enable bucketing enforcement before loading the target table. */
/* The property name is case-sensitive and must be all lower case. */
set hive.enforce.bucketing = true;

/* Reload Map_join_test with every row of Map_join_source_data. */
/* NOTE(review): select * relies on both tables having the same column */
/* order - confirm, or replace with an explicit column list. */
insert overwrite table Map_join_test
select *
from Map_join_source_data;
2013-08-31 09:08:43 Starting to launch local task to process Map join; maximum memory = 1004929024
2013-08-31 09:08:45 Processing rows: 200000 Hashtable size: 199999 Memory usage: 38823016 rate: 0.039
2013-08-31 09:08:46 Processing rows: 300000 Hashtable size: 299999 Memory usage: 56166968 rate: 0.056
……
2013-08-31 09:12:39 Processing rows: 4900000 Hashtable size: 4899999 Memory usage: 896968104 rate: 0.893
2013-08-31 09:12:47 Processing rows: 5000000 Hashtable size: 4999999 Memory usage: 922733048 rate: 0.918
Execution failed with exit status: 2
Obtaining error information
Task failed!
Task ID:
Stage-4
2013-08-31 09:20:39 Starting to launch local task to process Map join; maximum memory = 1004929024
2013-08-31 09:20:41 Processing rows: 200000 Hashtable size: 199999 Memory usage: 38844832 rate: 0.039
2013-08-31 09:20:42 Processing rows: 275567 Hashtable size: 275567 Memory usage: 51873632 rate: 0.052
2013-08-31 09:20:42 Dump the hashtable into file: file:/tmp/Hadoop/Hive_2013-08-31_21-20-37_444_1135806892100127714/-local-10003/HashTable-Stage-1/MapJoin-a-10-000000_0。hashtable
2013-08-31 09:20:46 Upload 1 File to: file:/tmp/Hadoop/Hive_2013-08-31_21-20-37_444_1135806892100127714/-local-10003/HashTable-Stage-1/MapJoin-a-10-000000_0。hashtable File size: 11022975
2013-08-31 09:20:47 Processing rows: 300000 Hashtable size: 24432 Memory usage: 8470976 rate: 0.008
2013-08-31 09:20:47 Processing rows: 400000 Hashtable size: 124432 Memory usage: 25368080 rate: 0.025
2013-08-31 09:20:48 Processing rows: 500000 Hashtable size: 224432 Memory usage: 42968080 rate: 0.043
2013-08-31 09:20:49 Processing rows: 551527 Hashtable size: 275960 Memory usage: 52022488 rate: 0.052
2013-08-31 09:20:49 Dump the hashtable into file: file:/tmp/Hadoop/Hive_2013-08-31_21-20-37_444_1135806892100127714/-local-10003/HashTable-Stage-1/MapJoin-a-10-000001_0。hashtable
……
/* Session settings for a sort-merge bucket map join. */
/* Property and class names are case-sensitive. */
/* NOTE(review): hive.optimize.bucketmapjoin is usually enabled as well; */
/* confirm it is set elsewhere in the session. */
set hive.optimize.bucketmapjoin.sortedmerge = true;
set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
/* Before the rewrite: count(distinct ...) evaluated directly per group. */
select a, count(distinct b) as c from tbl group by a;

/* After the rewrite: de-duplicate (a, b) first, then count per a. */
/* - The derived table needs an alias (t); Hive rejects it otherwise. */
/* - count(b) rather than count(*) keeps the result identical to */
/*   count(distinct b), which does not count NULL values of b. */
select
    t.a,
    count(t.b) as c
from (
    select distinct a, b
    from tbl
) t
group by t.a;
/* Join logs to users in two parts: the rows with user_id = 0 are joined */
/* in their own branch, all remaining rows in the other branch, and the */
/* two results are concatenated with union all. */
/* The outer query must reference the derived-table alias t; the inner */
/* alias a is not visible outside the parentheses. */
select t.*
from (
    select a.*
    from (select * from logs where user_id = 0) a
    join (select * from users where user_id = 0) b
        on a.user_id = b.user_id
    union all
    select a.*
    from logs a
    join users b
        on a.user_id <> 0
        and a.user_id = b.user_id
) t;
/* Three independent counts over logs, one per (log_date, item_id) pair, */
/* returned as three rows through a single union all statement. */
/* The statement is terminated with ';' so it does not run into the next one. */
select *
from (
    select count(*)
    from logs
    where log_date = 20130801 and item_id = 1
    union all
    select count(*)
    from logs
    where log_date = 20130802 and item_id = 2
    union all
    select count(*)
    from logs
    where log_date = 20130803 and item_id = 3
) t;
/* One output row per user_id that has at least one log row with */
/* page_name = 'a' and at least one with page_name = 'b'; the value is */
/* that user's total number of log rows. */
/* String literals use plain ASCII single quotes; typographic quotes are */
/* not valid string delimiters. */
select count(*)
from logs
group by user_id
having count(case when page_name = 'a' then 1 end) > 0
   and count(case when page_name = 'b' then 1 end) > 0;
欢迎光临 168大数据 (http://www.bi168.cn/) | Powered by Discuz! X3.2 |