探讨下Tag标签的数据库设计（千万级数据量）

发表于 2019-9-8 19:55:18

马上注册，结交更多数据大咖，获取更多知识干货，轻松玩转大数据

您需要登录才可以下载或查看，没有帐号？立即注册

x

本帖最后由 168主编于 2019-9-8 20:08 编辑

现在博客都有tag标签的功能，如何设计千万级数据量Tag数据库呢？现在把功能限制一下

tag需求：
（0）假定作品来自一个表（Poesy）,每个作品tag不超过5个
（1）根据tag可以找到所有包括该tag的作品,可以查看tag包含作品的数量。
(2)用户可以根据自己添加的tag来查看自己的作品,查看tag包括的作品的数量
(3)用户可以查看所有tag,可以查看所有tag包括的作品
(3)排序问题：按照作品访问量和时间排序

个人想了两套设计方案

方案一：在作品表里增加一个varchar(100)的tag列,各个tag以 , 分割，详见下：

--建表（作品表）：
if object_id('Poesy','u') is not null
drop table Poesy
go

create table Poesy
(
id int identity(1,1),
Title varchar(100),
Tag varchar(100)
)
go
alter table  Poesy add constraint pk_Posy primary key(Id)
go

if object_id('Tags','u') is not null
drop table Tags
go

create table Tags
(
Id int identity(1,1),
TagName Varchar(30),
TagCount int,  --作品数量
TagCreatDate datetime,
TagTips int --访问量
)
go
alter table  Tags add constraint pk_Tags primary key(Id)
go

--面向用户的tag
if object_id('UserTags','u') is not null
drop table UserTags
go

create table UserTags
(
Id int identity(1,1),
TagUserId int,
TagName Varchar(30),
TagCount int, --作品数量
TagCreatDate datetime,
TagTips int --访问量
)
go
alter table  UserTags add constraint pk_UserTags primary key(Id)
go

--用户在增加、修改、删除作品的时候，都要对 tags和UserTags表进行操作，更新两个表里的tag包括的作品数量（如果没有则增加tag)

--造数据到作品表
declare @Number int,@NumberTop int,@TagId int
Set @TagId = 1
Set @number = 1
Set @numberTop = 10000000

while @number<=@numberTop
begin
if @TagId = 100
      Set @TagId = 1
INSERT Poesy(Title,Tag)
      select 'title'+cast(@number as varchar), '历史'++cast(@TagId as varchar)+',铁木真,元朝历史,蒙古历史'+cast(@number as varchar)
Set @number = @number +1
Set @TagId = @TagId +1
end

--建立索引
create index Ix_poesyTag on Poesy(Tag)

--查询很慢要一分钟以上（1）
with Orderlist as (
select row_number() over(order by id desc) as rownumber,id,title
from poesy
where charindex(',蒙古历史10000,',','+tag+',')>0
) select RowNumber,Id,Title
from Orderlist
where RowNumber between 1 and 50

--查询很快（2）
with Orderlist as (
select row_number() over(order by id desc) as rownumber,id,title
from poesy
where charindex(',元朝历史,',','+tag+',')>0
) select RowNumber,Id,Title
from Orderlist
where RowNumber between 1 and 50
复制代码

--建立索引
create index Ix_poesyTag on Poesy(Tag)

查询1很慢，因为 tag里包含蒙古历史10000的记录只有一条。
查询2很快，因为tag里包括元朝历史的记录非常多

建立了索引，也没有什么变化。

方案二：增加一个文章TAG关联表。详见下：

--tag表
if object_id('TestTagName','u') is not null
drop table TestTagName
go

create table TestTagName
(
id int identity(1,1),
TagName varchar(30),
)
go
alter table  TestTagName add constraint pk_TestTagName primary key(Id)
go
--插入tag
insert TestTagName(Title)
select '铁木真' union all
select '元朝历史' union all
select '蒙古历史10000' union all
select '蒙古历史'

--文章tag表
if object_id('TestTagPoesy','u') is not null
drop table TestTagPoesy
go

create table TestTagPoesy
(
id int identity(1,1),
TagId int,
poesyid int
)
go
alter table  TestTagPoesy add constraint pk_TestTagPoesy primary key(Id)
go

--增加测试数据到文章tag表

insert TestTagPoesy(TagId,poesyid)
select 1,id
from poesy

insert TestTagPoesy(TagId,poesyid)
select 2,id
from poesy

insert TestTagPoesy(TagId,poesyid)
select 3,id
from poesy
where id= 10000

--索引：
create index ix_TestTagPoesy_poesyid on TestTagPoesy(poesyid)
create index ix_TestTagPoesy_tagid on TestTagPoesy(tagid)
create index ix_TestTagName_title on TestTagName(TagName)

--查询tag是蒙古历史10000的文章
with Orderlist as (
select row_number() over(order by a.id desc) as rownumber,a.id,a.title
from poesy a  inner join TestTagPoesy as b on a.id = b.poesyid
inner join TestTagName as c on b.tagid = c.id
where c.TagName ='蒙古历史10000'
) select RowNumber,Id,Title
from Orderlist
where RowNumber between 1 and 50

--查询tag是蒙古历史10000的文章
with Orderlist as (
select row_number() over(order by a.id desc) as rownumber,a.id,a.title
from poesy a  inner join TestTagPoesy as b on a.id = b.poesyid
inner join TestTagName as c on b.tagid = c.id
where c.TagName ='元朝历史'
) select RowNumber,Id,Title
from Orderlist
where RowNumber between 1 and 50

查询1很慢，因为 tag里包含蒙古历史10000的记录只有一条。
查询2很快，因为tag里包括元朝历史的记录非常多复制代码

三系统测试

机器配置：
内存2G
CPU core 6420 双CPU 2.13G

方案1 和方案二的查询结果是一样，速度都很慢。要一分钟以上。

（这里的测试重点是按照tag去找文章）

问题：

（1）这两个方案，大家支持哪一种？或者说这两个都不可取，可以有更好的方案。
（2）关于性能问题，这两个方案，除了提高硬件外还有别的办法提高性能吗？

帐号		自动登录	找回密码
密码			立即注册

探讨下Tag标签的数据库设计（千万级数据量）

马上注册，结交更多数据大咖，获取更多知识干货，轻松玩转大数据

站长推荐 /1