Neo4j图算法第九章介绍了在Yelp数据集上进行算法实践,今天先介绍如何将Yelp数据集导入Neo4j.
1.Yelp数据集可以在 https://www.yelp.com/dataset 下载,只需要填写简单的信息即可;也可以在 https://pan.baidu.com/s/1n3PXAtOWqj1cS0XajZyruA 下载;
2.解压后会得到如下图左侧json文件,下一步要将json文件转换为右侧csv文件;
3.json_to_csv
实际上 https://github.com/mneedham/yelp-graph-algorithms/blob/master/README.adoc 有介绍如何进行数据转换及导入,也开源了脚本,但我在运行这些脚本的时候总是报错,所以还是自己动手。
以下代码读取businessLocations.json,转换为area.csv、country.csv、city_IN_AREA_area.csv、area_IN_COUNTRY_country.csv。
import json
import csv
def _write_unique_rows(file_path, rows):
    """Write *rows* to *file_path* as fully quoted CSV, replacing any old file.

    Args:
        file_path: Destination file; it is created or truncated.
        rows: Iterable of row sequences (one sequence per CSV line).
    """
    # newline='' lets the csv module own the line endings (otherwise Windows
    # gets an extra blank line per row); utf-8 keeps non-ASCII place names
    # from raising UnicodeEncodeError under a locale-dependent default codec.
    with open(file_path, 'w', encoding='utf-8', newline='') as out_file:
        writer = csv.writer(out_file, escapechar='\\', quotechar='"',
                            quoting=csv.QUOTE_ALL, dialect='excel')
        writer.writerows(rows)


def read_json(path, filename='', out_dir='D:/share/yelp/'):
    """Convert businessLocations.json into area/country node and relation CSVs.

    The input is a single JSON object whose values are dicts carrying at
    least the keys ``admin1`` (area), ``city`` and ``country``.  Four files
    are written into *out_dir*:

    * ``area.csv`` - one row per distinct ``admin1``
    * ``country.csv`` - one row per distinct ``country``
    * ``city_IN_AREA_area.csv`` - distinct ``(city, admin1)`` pairs
    * ``area_IN_COUNTRY_country.csv`` - distinct ``(admin1, country)`` pairs

    Rows keep first-seen order.  Each run overwrites the four files, so
    re-running the conversion does not accumulate duplicate rows.

    Args:
        path: Directory prefix of the input (ending in a separator), or the
            full path of the input file when *filename* is omitted.
        filename: File name appended verbatim to *path*.
        out_dir: Output directory prefix; the file names are appended to it
            verbatim, so it must end with a path separator.

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    with open(path + filename, 'r', encoding='utf-8') as f:
        content = json.load(f, strict=False)

    # dicts serve as insertion-ordered sets: node ids must be unique in the
    # node files, and repeating an identical relationship row adds nothing.
    areas = {}
    countries = {}
    city_in_area = {}
    area_in_country = {}

    for location in content.values():
        try:
            admin1 = location['admin1']
            city = location['city']
            country = location['country']
        except (KeyError, TypeError):
            # Skip records that lack a required field instead of aborting.
            print("there is a error")
            continue
        areas[(admin1,)] = None
        countries[(country,)] = None
        city_in_area[(city, admin1)] = None
        area_in_country[(admin1, country)] = None

    _write_unique_rows(out_dir + 'area.csv', areas)
    _write_unique_rows(out_dir + 'country.csv', countries)
    _write_unique_rows(out_dir + 'city_IN_AREA_area.csv', city_in_area)
    _write_unique_rows(out_dir + 'area_IN_COUNTRY_country.csv', area_in_country)
if __name__ == '__main__':
    # read json
    # read_json joins its two arguments as path + filename, so pass the
    # directory prefix and the file name separately.
    path = 'D:/share/yelp/'
    read_json(path, 'businessLocations.json')
接下来生成以下文件。
business.csv |
category.csv |
user.csv |
review.csv |
city.csv |
business_IN_CATEGORY_category.csv |
user_FRIENDS_user.csv |
user_WROTE_review.csv |
review_REVIEWS_business.csv |
business_IN_CITY_city.csv |
import json
import csv
def _csv_writer(handle):
    """Return a csv writer that quotes every field, as used for all outputs."""
    return csv.writer(handle, escapechar='\\', quotechar='"',
                      quoting=csv.QUOTE_ALL)


def _open_out(out_dir, name):
    """Open ``out_dir + name`` for writing, replacing any existing file."""
    # newline='' keeps the csv module in charge of line endings, so embedded
    # newlines in review text are not rewritten on Windows; utf-8 avoids
    # UnicodeEncodeError under a locale-dependent default codec.
    return open(out_dir + name, 'w', encoding='utf-8', newline='')


def _split_list(value):
    """Split a comma-separated field into stripped, non-empty entries.

    ``None`` and empty values give an empty list.  A value that is already
    a list is accepted too (assumption: some dataset versions ship lists
    instead of strings - verify against your copy of the data).
    """
    if not value:
        return []
    parts = value.split(',') if isinstance(value, str) else value
    # Entries are typically separated by ", "; without strip() the leading
    # space would become part of the category name / friend id.
    return [part.strip() for part in parts if part.strip()]


def _iter_records(handle):
    """Yield ``(line_number, record)`` for each non-blank JSON line."""
    for line_no, line in enumerate(handle, 1):
        if line.strip():
            yield line_no, json.loads(line)


def _report_missing(line_no, err):
    """Report a record that was skipped because a required key is absent."""
    print("there is a error: line %d is missing field %s" % (line_no, err))


def _convert_business(records, out_dir):
    """Write the business, city and category node and relationship CSVs."""
    # dicts serve as insertion-ordered sets of the distinct node names.
    cities = {}
    categories = {}
    with _open_out(out_dir, 'business.csv') as business_csv, \
            _open_out(out_dir, 'business_IN_CITY_city.csv') as business_city_csv, \
            _open_out(out_dir, 'business_IN_CATEGORY_category.csv') as business_category_csv:
        business_writer = _csv_writer(business_csv)
        business_city_writer = _csv_writer(business_city_csv)
        business_category_writer = _csv_writer(business_category_csv)
        for line_no, item in records:
            # Read every required field before writing anything, so a bad
            # record never leaves a partial set of rows behind.
            try:
                business_id = item['business_id']
                city = item['city']
                row = [business_id, item['name'], item['address'], city,
                       item['state']]
            except KeyError as err:
                _report_missing(line_no, err)
                continue
            business_writer.writerow(row)
            cities[city] = None
            business_city_writer.writerow([business_id, city])
            for category in _split_list(item.get('categories')):
                categories[category] = None
                business_category_writer.writerow([business_id, category])
    with _open_out(out_dir, 'city.csv') as city_csv:
        _csv_writer(city_csv).writerows([city] for city in cities)
    with _open_out(out_dir, 'category.csv') as categories_csv:
        _csv_writer(categories_csv).writerows(
            [category] for category in categories)


def _convert_user(records, out_dir):
    """Write the user node CSV and the user-FRIENDS-user relationship CSV."""
    with _open_out(out_dir, 'user.csv') as user_csv, \
            _open_out(out_dir, 'user_FRIENDS_user.csv') as user_user_csv:
        user_writer = _csv_writer(user_csv)
        user_user_writer = _csv_writer(user_user_csv)
        for line_no, item in records:
            try:
                user_id = item['user_id']
                name = item['name']
            except KeyError as err:
                _report_missing(line_no, err)
                continue
            user_writer.writerow([user_id, name])
            for friend_id in _split_list(item.get('friends')):
                user_user_writer.writerow([user_id, friend_id])


def _convert_review(records, out_dir):
    """Write the review node CSV and its WROTE / REVIEWS relationship CSVs."""
    with _open_out(out_dir, 'review.csv') as review_csv, \
            _open_out(out_dir, 'user_WROTE_review.csv') as user_review_csv, \
            _open_out(out_dir, 'review_REVIEWS_business.csv') as review_business_csv:
        review_writer = _csv_writer(review_csv)
        user_review_writer = _csv_writer(user_review_csv)
        review_business_writer = _csv_writer(review_business_csv)
        for line_no, item in records:
            try:
                review_id = item['review_id']
                row = [review_id, item['text'], item['stars'], item['date']]
                user_id = item['user_id']
                business_id = item['business_id']
            except KeyError as err:
                _report_missing(line_no, err)
                continue
            review_writer.writerow(row)
            user_review_writer.writerow([user_id, review_id])
            review_business_writer.writerow([review_id, business_id])


def read_json(path, filename, out_dir='D:/share/yelp/'):
    """Convert one JSON-lines dataset file into CSV files for bulk import.

    The input holds one JSON object per line and is streamed line by line,
    so it is never loaded into memory as a whole.  Which files are written
    depends on *filename*:

    * ``business.json`` -> ``business.csv``, ``city.csv``, ``category.csv``,
      ``business_IN_CITY_city.csv``, ``business_IN_CATEGORY_category.csv``
    * ``user.json`` -> ``user.csv``, ``user_FRIENDS_user.csv``
    * ``review.json`` -> ``review.csv``, ``user_WROTE_review.csv``,
      ``review_REVIEWS_business.csv``

    Any other *filename* is ignored.  Output files are overwritten on each
    run.  Records missing a required key are reported on stdout and
    skipped; a missing or empty ``categories`` / ``friends`` field simply
    produces no relationship rows.

    Args:
        path: Directory prefix of the input; *filename* is appended to it
            verbatim, so it must end with a path separator.
        filename: Name of the input file; also selects the conversion.
        out_dir: Output directory prefix, ending with a path separator.

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    converters = {
        'business.json': _convert_business,
        'user.json': _convert_user,
        'review.json': _convert_review,
    }
    convert = converters.get(filename)
    if convert is None:
        return
    with open(path + filename, 'r', encoding='utf-8') as f:
        convert(_iter_records(f), out_dir)
if __name__ == '__main__':
    # read json
    # Convert each dataset file in turn; read_json builds the full input
    # path itself from the directory prefix and the file name.
    path = 'D:/share/yelp/'
    for filename in ('business.json', 'review.json', 'user.json'):
        read_json(path, filename)
除以上文件外,还需要创建header文件,见import.sh文件。
代码如下:
import json
import csv
def write_header(file_name, columns):
    """Write a one-line CSV header file, replacing any existing file.

    Args:
        file_name: Path of the header file to create.
        columns: Sequence of column definitions written as a single row,
            e.g. ``['id:ID(User)', 'name']``.

    Raises:
        OSError: If the file cannot be created or written.
    """
    # newline='' lets the csv module emit its own line terminator unchanged
    # (otherwise Windows would translate it again); the with-block closes
    # the file, so no explicit close() is needed.
    with open(file_name, 'w', encoding='utf-8', newline='') as file_csv:
        writer = csv.writer(file_csv)
        writer.writerow(columns)
if __name__ == '__main__':
    # One (header file, column definitions) entry per CSV data file; the
    # header files are written in exactly this order.
    header_specs = [
        ("D:/share/yelp/area_header.csv",
         ['name:ID(Area)']),
        ("D:/share/yelp/country_header.csv",
         ['name:ID(Country)']),
        ("D:/share/yelp/city_IN_AREA_area_header.csv",
         [':START_ID(City)', ':END_ID(Area)']),
        ("D:/share/yelp/area_IN_COUNTRY_country_header.csv",
         [':START_ID(Area)', ':END_ID(Country)']),
        ("D:/share/yelp/business_header.csv",
         ['id:ID(Business)', 'name', 'address', 'city', 'state']),
        ("D:/share/yelp/city_header.csv",
         ['name:ID(City)']),
        ("D:/share/yelp/business_IN_CITY_city_header.csv",
         [':START_ID(Business)', ':END_ID(City)']),
        ("D:/share/yelp/category_header.csv",
         ['name:ID(Category)']),
        ("D:/share/yelp/business_IN_CATEGORY_category_header.csv",
         [':START_ID(Business)', ':END_ID(Category)']),
        ("D:/share/yelp/user_header.csv",
         ['id:ID(User)', 'name']),
        ("D:/share/yelp/user_FRIENDS_user_header.csv",
         [':START_ID(User)', ':END_ID(User)']),
        ("D:/share/yelp/review_header.csv",
         ['id:ID(Review)', 'text', 'stars:int', 'date']),
        ("D:/share/yelp/user_WROTE_review_header.csv",
         [':START_ID(User)', ':END_ID(Review)']),
        ("D:/share/yelp/review_REVIEWS_business_header.csv",
         [':START_ID(Review)', ':END_ID(Business)']),
    ]
    for header_path, header_columns in header_specs:
        write_header(header_path, header_columns)
4、所有csv文件生成以后,就可以执行导入操作了,直接执行import.sh。实际上我更加倾向于直接读取json文件将数据写入Neo4j,免得再转一道。
#!/usr/bin/env bash
# Bulk-import the generated Yelp CSV files into the yelp.db database.
# Run from the Neo4j installation directory (the tool is invoked as
# ./bin/neo4j-admin). Each --nodes / --relationships argument lists the
# header file first, then the matching data file.

# No trailing slash here: every path below adds its own "/", and the
# expansions are quoted so a data directory containing spaces still works.
export DATA="D:/share/yelp"

./bin/neo4j-admin import \
    --mode=csv \
    --database=yelp.db \
    --nodes:Business "$DATA/business_header.csv,$DATA/business.csv" \
    --nodes:Category "$DATA/category_header.csv,$DATA/category.csv" \
    --nodes:User "$DATA/user_header.csv,$DATA/user.csv" \
    --nodes:Review "$DATA/review_header.csv,$DATA/review.csv" \
    --nodes:City "$DATA/city_header.csv,$DATA/city.csv" \
    --nodes:Area "$DATA/area_header.csv,$DATA/area.csv" \
    --nodes:Country "$DATA/country_header.csv,$DATA/country.csv" \
    --relationships:IN_CATEGORY "$DATA/business_IN_CATEGORY_category_header.csv,$DATA/business_IN_CATEGORY_category.csv" \
    --relationships:FRIENDS "$DATA/user_FRIENDS_user_header.csv,$DATA/user_FRIENDS_user.csv" \
    --relationships:WROTE "$DATA/user_WROTE_review_header.csv,$DATA/user_WROTE_review.csv" \
    --relationships:REVIEWS "$DATA/review_REVIEWS_business_header.csv,$DATA/review_REVIEWS_business.csv" \
    --relationships:IN_CITY "$DATA/business_IN_CITY_city_header.csv,$DATA/business_IN_CITY_city.csv" \
    --relationships:IN_AREA "$DATA/city_IN_AREA_area_header.csv,$DATA/city_IN_AREA_area.csv" \
    --relationships:IN_COUNTRY "$DATA/area_IN_COUNTRY_country_header.csv,$DATA/area_IN_COUNTRY_country.csv" \
    --ignore-missing-nodes=true \
    --multiline-fields=true
5、知识图谱设计如下图
数据导入之后,就可以调用Neo4j自带的图算法进行分析。