Add a retry system for failed images, so they get another chance to send a request and try again if needed.
master
sigonasr2 5 years ago
parent 70503ab80a
commit ff857cc28e
  1. BIN
      Tagger/Tagger.jar
  2. 273
      Tagger/src/PixivManager.java
  3. 1
      Tagger/src/imageTag.java

Binary file not shown.

@ -58,131 +58,27 @@ public class PixivManager {
}
for (String s : imageTag.pixiv_image_list) {
//String url = "https://api.proxycrawl.com/?token=ahDRaxo3KT2OOX2nQZQV9A&url=https://www.pixiv.net/en/artworks/"+s;
String url = "http://45.33.13.215/crawler/crawler/"+s+".html";
try {
if (!new File("downloadedData/temp"+s+".html").exists()) {
System.out.println("Starting download of "+url+" ...");
utils.downloadFileFromUrl(url, "downloadedData/temp"+s+".html");
if (new File("downloadedData/temp"+s+".html").exists()) {
String[] data = utils.readFromFile("downloadedData/temp"+s+".html");
int scriptEndLine = 0;
while (scriptEndLine<data.length) {
if (data[scriptEndLine].contains("<meta name=\"preload-data\" id=\"meta-preload-data\" content='")) {
System.out.println("Found JSON Target line at line "+scriptEndLine+". :: "+data[scriptEndLine] );
break;
}
scriptEndLine++;
}
if (scriptEndLine==data.length) {
System.out.println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
System.out.println(" IMAGE "+s+" FAILED TO PARSE CORRECTLY! Something is messed up about the file!!");
System.out.println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
System.out.println("Skipping image "+s+" because webpage cannot be found.");
utils.logToFile(s+"\n", "skippedItems.txt");
} else {
File finaldata = new File("finaltemp");
FileWriter fw;
try {
fw = new FileWriter(finaldata);
BufferedWriter bw = new BufferedWriter(fw);
System.out.println(data[scriptEndLine]);
int cutpos = data[scriptEndLine].indexOf("<meta name=\"preload-data\" id=\"meta-preload-data\" content='")+58;
System.out.println(data[scriptEndLine].length()+"///"+data[scriptEndLine].indexOf("}}}'>")+"///"+cutpos);
if (cutpos<data[scriptEndLine].length()) {
bw.write(data[scriptEndLine].substring(cutpos,data[scriptEndLine].indexOf("}}}'>")+3));
System.out.println(data[scriptEndLine].substring(cutpos,data[scriptEndLine].indexOf("}}}'>")+3));
}
bw.close();
fw.close();
} catch (IOException e) {
e.printStackTrace();
}
JSONObject jsonData = utils.readJsonFromFile("finaltemp");
//System.out.println(Arrays.deepToString(JSONObject.getNames(jsonData.getJSONObject("preload"))));
//System.out.println(Arrays.deepToString(JSONObject.getNames(jsonData.getJSONObject("preload").getJSONObject("illust"))));
JSONArray tagsArray = jsonData.getJSONObject("illust").getJSONObject(s).getJSONObject("tags").getJSONArray("tags");
for (int i=0;i<tagsArray.length();i++) {
boolean hasEnglishTag=false;
JSONObject tag = tagsArray.getJSONObject(i);
String ENTag="";
String romaji="";
if (tag.has("romaji")) {
romaji = tag.getString("romaji");
}
if (tag.has("translation")) {
JSONObject translationObj = tag.getJSONObject("translation");
if (translationObj.has("en")) {
hasEnglishTag=true;
ENTag = translationObj.getString("en");
}
} else
if (tag.has("tag") /*&& romaji.length()==0 */&& tag.getString("tag").matches("[ -~]")) {
hasEnglishTag=true;
ENTag = tag.getString("tag");
}
if (ENTag.replaceAll("\\?", "").trim().length()==0) {
ENTag="";
hasEnglishTag=false;
}
boolean tagSubmitted=false;
String insertedTag="";
if (hasEnglishTag && ENTag.length()>0) {
insertedTag = ENTag;
tagSubmitted=true;
} else
if (romaji.length()>0){
insertedTag = romaji;
tagSubmitted=true;
}
//insertedTag is the tag that will be used for the image.
insertedTag = ConvertTag(insertedTag.trim().toLowerCase());
if (tagSubmitted) {
if (imageTag.tag_whitelist.size()==0 || imageTag.tag_whitelist.containsKey(insertedTag.trim().toLowerCase())) {
if (imageTag.taglist.containsKey(s)) {
List<String> tags = imageTag.taglist.get(s);
tags.add(insertedTag);
imageTag.taglist.put(s, tags);
} else {
List<String> tags = new ArrayList<String>();
tags.add(insertedTag);
imageTag.taglist.put(s,tags);
}
if (imageTag.tagCounter.containsKey(insertedTag)) {
imageTag.tagCounter.put(insertedTag,imageTag.tagCounter.get(insertedTag)+1);
} else {
imageTag.tagCounter.put(insertedTag,1);
}
}
}
}
String taglist = s+": <"+imageTag.taglist.get(s)+">";
//System.out.println(taglist);
bwOutput.append(taglist);
bwOutput.newLine();
//jsonData.getJSONObject("preload").getJSONObject("illust").getJSONObject(s).getJSONObject("tags");
}/* else {
System.out.println("Skipping image "+s+" because webpage cannot be found.");
utils.logToFile(s+"\n", "skippedItems.txt");
}*/
} else {
System.out.println("Skipping image "+s+" because it has already been processed.");
}
}
} catch (IOException e) {
if (e instanceof FileNotFoundException) {
System.out.println("Skipping image "+s+" because webpage cannot be found.");
utils.logToFile(s, "skippedItems.txt");
} else {
AttemptDownload(bwOutput, s, true);
}
// Give every image whose download produced no file up to MAXATTEMPTS further passes.
int retryAttempts=0;
final int MAXATTEMPTS = 3;
while (retryAttempts<MAXATTEMPTS) {
    if (imageTag.pixiv_retry_list.size()>0) {
        // Snapshot the pending IDs and clear the shared list before retrying:
        // AttemptDownload appends to pixiv_retry_list when an image fails again,
        // so this pass has to walk the snapshot, not the shared list.
        List<String> retryList = new ArrayList<String>(imageTag.pixiv_retry_list);
        imageTag.pixiv_retry_list.clear();
        try {
            // Pause for 10 seconds before starting this retry pass.
            Thread.sleep(10000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        // Previously this iterated the just-cleared pixiv_retry_list, so nothing was ever retried.
        for (String s : retryList) {
            System.out.println(" Retry Attempt Number "+(retryAttempts+1)+"...");
            // NOTE(review): this flag is always true inside the loop, so IDs that fail on the
            // final pass are still left in pixiv_retry_list afterwards - confirm that is intended.
            AttemptDownload(bwOutput, s, (retryAttempts<MAXATTEMPTS));
        }
    }
    retryAttempts++;
}
/*for (String s : imageTag.taglist.keySet()) {
@ -238,6 +134,139 @@ public class PixivManager {
}
}
}
/**
 * Downloads the crawler page for one image, extracts the tag JSON embedded in it and
 * records the resulting tags in {@code imageTag.taglist} / {@code imageTag.tagCounter}.
 *
 * <p>Nothing is done when {@code downloadedData/temp<s>.html} already exists. When the
 * download leaves no file behind, the image is reported and, if requested, queued in
 * {@code imageTag.pixiv_retry_list}. When the page holds no usable preload JSON, the image
 * is written to {@code skippedItems.txt}. On success one {@code "<s>: <tags>"} line is
 * appended to {@code bwOutput}.
 *
 * @param bwOutput             writer that receives one summary line per processed image
 * @param s                    image identifier; used in the URL, the temp file name and as JSON key
 * @param addToRetryListOnFail whether to queue {@code s} for a retry when no file was downloaded
 */
private void AttemptDownload(BufferedWriter bwOutput, String s, boolean addToRetryListOnFail) {
    String url = "http://45.33.13.215/crawler/crawler/"+s+".html";
    String tempPath = "downloadedData/temp"+s+".html";
    try {
        if (new File(tempPath).exists()) {
            // A page for this image is already on disk; it is not downloaded or parsed again.
            return;
        }
        System.out.println("Starting download of "+url+" ...");
        utils.downloadFileFromUrl(url, tempPath);
        if (!new File(tempPath).exists()) {
            System.out.println("Skipping image "+s+" because the server couldn't find it. Will retry it later...");
            if (addToRetryListOnFail) {
                imageTag.pixiv_retry_list.add(s);
            }
            return;
        }

        // Locate the first line carrying the preload-data meta tag.
        String[] data = utils.readFromFile(tempPath);
        String targetLine = null;
        for (int i=0;i<data.length;i++) {
            if (data[i].contains(PRELOAD_META_PREFIX)) {
                System.out.println("Found JSON Target line at line "+i+". :: "+data[i]);
                targetLine = data[i];
                break;
            }
        }
        String json = (targetLine==null) ? null : extractPreloadJson(targetLine);
        if (json==null) {
            // Either the meta tag is missing or its closing "}}}'>" is absent; a missing
            // terminator used to surface as a StringIndexOutOfBoundsException.
            reportParseFailure(s);
            return;
        }
        System.out.println(json);

        // The JSON is handed to utils.readJsonFromFile through the "finaltemp" scratch file.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File("finaltemp")))) {
            bw.write(json);
        } catch (IOException e) {
            e.printStackTrace();
            // Do not continue: "finaltemp" could still hold a previous image's data.
            return;
        }
        JSONObject jsonData = utils.readJsonFromFile("finaltemp");
        JSONArray tagsArray = jsonData.getJSONObject("illust").getJSONObject(s).getJSONObject("tags").getJSONArray("tags");
        for (int i=0;i<tagsArray.length();i++) {
            String chosen = chooseTagName(tagsArray.getJSONObject(i));
            // insertedTag is the tag that will be used for the image.
            String insertedTag = ConvertTag(chosen.trim().toLowerCase());
            if (chosen.length()==0) {
                continue; // neither an English name nor a romaji reading is available
            }
            // An empty whitelist accepts every tag.
            if (imageTag.tag_whitelist.size()==0 || imageTag.tag_whitelist.containsKey(insertedTag.trim().toLowerCase())) {
                recordTag(s, insertedTag);
            }
        }
        String taglist = s+": <"+imageTag.taglist.get(s)+">";
        bwOutput.append(taglist);
        bwOutput.newLine();
    } catch (FileNotFoundException e) {
        System.out.println("Skipping image "+s+" because webpage cannot be found.");
        // Terminate the entry with "\n" like the other skippedItems.txt write does.
        utils.logToFile(s+"\n", "skippedItems.txt");
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/** Opening of the meta tag whose content attribute holds the page's preload JSON. */
private static final String PRELOAD_META_PREFIX = "<meta name=\"preload-data\" id=\"meta-preload-data\" content='";

/** Text that closes the preload JSON; its first three characters belong to the JSON itself. */
private static final String PRELOAD_JSON_END = "}}}'>";

/** Returns the JSON text between the preload meta prefix and its terminator, or null if the line has no terminator after the prefix. */
private static String extractPreloadJson(String line) {
    int start = line.indexOf(PRELOAD_META_PREFIX)+PRELOAD_META_PREFIX.length();
    int end = line.indexOf(PRELOAD_JSON_END, start);
    if (end<0) {
        return null;
    }
    return line.substring(start, end+3);
}

/** Prints the parse-failure banner for an image and logs its identifier to skippedItems.txt. */
private static void reportParseFailure(String s) {
    System.out.println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
    System.out.println(" IMAGE "+s+" FAILED TO PARSE CORRECTLY! Something is messed up about the file!!");
    System.out.println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
    System.out.println("Skipping image "+s+" because webpage cannot be found.");
    utils.logToFile(s+"\n", "skippedItems.txt");
}

/** Picks the name to use for one tag object: the English name if usable, else the romaji reading, else "". */
private static String chooseTagName(JSONObject tag) {
    String romaji = tag.has("romaji") ? tag.getString("romaji") : "";
    String english = "";
    if (tag.has("translation")) {
        JSONObject translationObj = tag.getJSONObject("translation");
        if (translationObj.has("en")) {
            english = translationObj.getString("en");
        }
    } else if (tag.has("tag") && tag.getString("tag").matches("[ -~]+")) {
        // String.matches tests the whole string, so the "+" is required for tags longer
        // than one character to count as printable ASCII.
        english = tag.getString("tag");
    }
    // A name consisting only of '?' and whitespace is treated as missing.
    if (english.replaceAll("\\?", "").trim().length()==0) {
        english = "";
    }
    if (english.length()>0) {
        return english;
    }
    return romaji;
}

/** Appends the tag to the image's entry in imageTag.taglist and increments its count in imageTag.tagCounter. */
private static void recordTag(String s, String insertedTag) {
    List<String> tags = imageTag.taglist.get(s);
    if (tags==null) {
        tags = new ArrayList<String>();
        imageTag.taglist.put(s, tags);
    }
    tags.add(insertedTag);
    if (imageTag.tagCounter.containsKey(insertedTag)) {
        imageTag.tagCounter.put(insertedTag,imageTag.tagCounter.get(insertedTag)+1);
    } else {
        imageTag.tagCounter.put(insertedTag,1);
    }
}
private String ConvertTag(String insertedTag) {

@ -26,6 +26,7 @@ public class imageTag {
public static Filters filters;
// Tags allowed to be recorded. PixivManager accepts every tag while this map is empty;
// otherwise only tags present as keys are recorded.
public static HashMap<String,Boolean> tag_whitelist = new HashMap<String,Boolean>();
// Image identifiers that PixivManager downloads and tags, one crawler page per entry.
public static List<String> pixiv_image_list = new ArrayList<String>();
// Image identifiers whose download left no file behind; PixivManager re-attempts these.
public static List<String> pixiv_retry_list = new ArrayList<String>();
// NOTE(review): presumably the local image files the identifiers were taken from - confirm.
public static List<File> pixiv_rawimage_list = new ArrayList<File>();
// Maps an image identifier to the list of tags recorded for it.
public static HashMap<String,List<String>> taglist = new HashMap<String,List<String>>();
public static HashMap<String,String> subtaglist = new HashMap<String,String>();

Loading…
Cancel
Save